From e4001271d76b4a1b7be5d6dddba1f509fb9fb799 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Wed, 3 Dec 2025 22:17:13 -0500 Subject: [PATCH 01/11] update batch texture --- include/madrona/importer.hpp | 9 +- src/bridge/bindings.cpp | 120 +++--- src/bridge/mgr.cpp | 102 +++-- src/bridge/mgr.hpp | 3 +- src/importer/gltf.cpp | 38 +- src/render/batch_renderer.cpp | 82 ++-- src/render/render_ctx.cpp | 462 +++++++++-------------- src/render/render_ctx.hpp | 1 - src/render/shaders/batch_draw_depth.hlsl | 15 +- src/render/shaders/batch_draw_rgb.hlsl | 124 +++--- src/render/shaders/draw_gbuffer.hlsl | 10 +- src/render/shaders/shader_common.h | 8 +- src/render/shaders/shader_utils.hlsl | 1 - src/render/shaders/viewer_cull.hlsl | 6 +- src/render/shaders/viewer_draw.hlsl | 81 ++-- src/render/shaders/voxel_draw.hlsl | 3 + 16 files changed, 443 insertions(+), 622 deletions(-) diff --git a/include/madrona/importer.hpp b/include/madrona/importer.hpp index 8d9f9265..731035f4 100644 --- a/include/madrona/importer.hpp +++ b/include/madrona/importer.hpp @@ -47,8 +47,8 @@ struct SourceMaterial { // If this is -1, no texture will be applied. Otherwise, // the color gets multipled by color of the texture read in // at the UVs of the pixel. - int32_t textureIdx; - + int32_t *textureIdx; + uint32_t numTextures; float roughness; float metalness; }; @@ -66,8 +66,7 @@ class ImageImporter { ImageImporter(ImageImporter &&); ~ImageImporter(); - using ImportHandler = - Optional (*)(void *data, size_t num_bytes); + using ImportHandler = Optional (*)(void *data, size_t num_bytes); int32_t addHandler(const char *extension, ImportHandler fn); @@ -106,9 +105,9 @@ struct ImportedAssets { DynArray objects; DynArray materials; + DynArray materialTextures; DynArray instances; DynArray textures; - }; class AssetImporter { diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp index 30e1ce12..e5948391 100644 --- a/src/bridge/bindings.cpp +++ b/src/bridge/bindings.cpp @@ -23,59 +23,33 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { .def("__init__", []( Manager *self, int64_t gpu_id, - nb::ndarray, - nb::device::cpu> mesh_vertices, - nb::ndarray, - nb::device::cpu> mesh_faces, - nb::ndarray, - nb::device::cpu> mesh_vertex_offsets, - nb::ndarray, - nb::device::cpu> mesh_face_offsets, - nb::ndarray, - nb::device::cpu> mesh_texcoords, - nb::ndarray, - nb::device::cpu> mesh_texcoord_offsets, - nb::ndarray, - nb::device::cpu> mesh_texcoord_num, - nb::ndarray, - nb::device::cpu> geom_types, - nb::ndarray, - nb::device::cpu> geom_groups, - nb::ndarray, - nb::device::cpu> geom_data_ids, - nb::ndarray, - nb::device::cpu> geom_sizes, - nb::ndarray, - nb::device::cpu> geom_mat_ids, - nb::ndarray, - nb::device::cpu> geom_rgba, - nb::ndarray, - nb::device::cpu> mat_rgba, - nb::ndarray, - nb::device::cpu> mat_tex_ids, - nb::ndarray, - nb::device::cpu> tex_data, - nb::ndarray, - nb::device::cpu> tex_offsets, - nb::ndarray, - nb::device::cpu> tex_widths, - nb::ndarray, - nb::device::cpu> tex_heights, - nb::ndarray, - nb::device::cpu> tex_nchans, + nb::ndarray, nb::device::cpu> mesh_vertices, + nb::ndarray, nb::device::cpu> mesh_faces, + nb::ndarray, nb::device::cpu> mesh_vertex_offsets, + nb::ndarray, nb::device::cpu> mesh_face_offsets, + nb::ndarray, nb::device::cpu> mesh_texcoords, + nb::ndarray, nb::device::cpu> mesh_texcoord_offsets, + nb::ndarray, nb::device::cpu> geom_types, + nb::ndarray, nb::device::cpu> geom_groups, + nb::ndarray, nb::device::cpu> geom_data_ids, + nb::ndarray, nb::device::cpu> geom_sizes, + nb::ndarray, nb::device::cpu> geom_mat_ids, + nb::ndarray, nb::device::cpu> mat_rgba, + nb::ndarray, nb::device::cpu> mat_tex_ids, + nb::ndarray, nb::device::cpu> mat_tex_offsets, + nb::ndarray, nb::device::cpu> tex_data, + nb::ndarray, nb::device::cpu> tex_offsets, + nb::ndarray, nb::device::cpu> tex_widths, + nb::ndarray, nb::device::cpu> tex_heights, + nb::ndarray, nb::device::cpu> tex_nchans, int64_t num_lights, - int64_t num_cams, int64_t num_worlds, int64_t batch_render_view_width, int64_t batch_render_view_height, - nb::ndarray, - nb::device::cpu> cam_fovy, - nb::ndarray, - nb::device::cpu> cam_znear, - nb::ndarray, - nb::device::cpu> cam_zfar, - nb::ndarray, - nb::device::cpu> enabled_geom_groups, + nb::ndarray, nb::device::cpu> cam_fovy, + nb::ndarray, nb::device::cpu> cam_znear, + nb::ndarray, nb::device::cpu> cam_zfar, + nb::ndarray, nb::device::cpu> enabled_geom_groups, bool add_cam_debug_geo, bool use_rt, VisualizerGPUHandles *viz_gpu_hdls) @@ -87,21 +61,15 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { .triOffsets = (uint32_t *)mesh_face_offsets.data(), .texCoords = (math::Vector2 *)mesh_texcoords.data(), .texCoordOffsets = (int32_t *)mesh_texcoord_offsets.data(), - .texCoordNum = (uint32_t *)mesh_texcoord_num.data(), .numVertices = (uint32_t)mesh_vertices.shape(0), .numTris = (uint32_t)mesh_faces.shape(0), .numMeshes = (uint32_t)mesh_vertex_offsets.shape(0), }; // We need to make some copies because mgr.cpp will override - math::Vector4 *ptr_geom_rgba = (math::Vector4 *)malloc( - sizeof(math::Vector4) * geom_rgba.shape(0)); - int32_t *ptr_geom_mat_ids = (int32_t *)malloc( - sizeof(int32_t) * geom_mat_ids.shape(0)); - int32_t *ptr_geom_data_ids = (int32_t *)malloc( - sizeof(int32_t) * geom_data_ids.shape(0)); + int32_t *ptr_geom_mat_ids = (int32_t *)malloc(sizeof(int32_t) * geom_mat_ids.shape(0)); + int32_t *ptr_geom_data_ids = (int32_t *)malloc(sizeof(int32_t) * geom_data_ids.shape(0)); - memcpy(ptr_geom_rgba, geom_rgba.data(), sizeof(math::Vector4) * geom_rgba.shape(0)); memcpy(ptr_geom_mat_ids, geom_mat_ids.data(), sizeof(int32_t) * geom_mat_ids.shape(0)); memcpy(ptr_geom_data_ids, geom_data_ids.data(), sizeof(int32_t) * geom_data_ids.shape(0)); @@ -109,13 +77,13 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { .meshGeo = mesh_geo, .geomTypes = (int32_t *)geom_types.data(), .geomGroups = (int32_t *)geom_groups.data(), - .geomDataIDs = ptr_geom_data_ids,//(int32_t *)geom_data_ids.data(), - .geomMatIDs = ptr_geom_mat_ids,//(int32_t *)geom_mat_ids.data(), + .geomDataIDs = ptr_geom_data_ids, // (int32_t *)geom_data_ids.data(), + .geomMatIDs = ptr_geom_mat_ids, // (int32_t *)geom_mat_ids.data(), .enabledGeomGroups = (int32_t *)enabled_geom_groups.data(), .geomSizes = (math::Vector3 *)geom_sizes.data(), - .geomRGBA = ptr_geom_rgba,//(math::Vector4 *)geom_rgba.data(), .matRGBA = (math::Vector4 *)mat_rgba.data(), .matTexIDs = (int32_t *)mat_tex_ids.data(), + .matTexOffsets = (int32_t *)mat_tex_offsets.data(), .texData = (uint8_t *) tex_data.data(), .texOffsets = (int64_t *)tex_offsets.data(), .texWidths = (int32_t *)tex_widths.data(), @@ -123,8 +91,9 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { .texNChans = (int32_t *)tex_nchans.data(), .numGeoms = (uint32_t)geom_types.shape(0), .numMats = (uint32_t)mat_rgba.shape(0), + .numMatTextures = (uint32_t)mat_tex_ids.shape(0), .numTextures = (uint32_t)tex_offsets.shape(0), - .numCams = (uint32_t)num_cams, + .numCams = (uint32_t)cam_fovy.shape(0), .numLights = (uint32_t)num_lights, .numEnabledGeomGroups = (uint32_t)enabled_geom_groups.shape(0), .camFovy = (float *)cam_fovy.data(), @@ -132,15 +101,18 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { .camZFar = (float *)cam_zfar.data(), }; - new (self) Manager(Manager::Config { - .gpuID = (int)gpu_id, - .numWorlds = (uint32_t)num_worlds, - .batchRenderViewWidth = (uint32_t)batch_render_view_width, - .batchRenderViewHeight = (uint32_t)batch_render_view_height, - .addCamDebugGeometry = add_cam_debug_geo, - .useRT = use_rt, - }, gs_model, viz_gpu_hdls != nullptr ? *viz_gpu_hdls : - Optional::none()); + new (self) Manager( + Manager::Config { + .gpuID = (int)gpu_id, + .numWorlds = (uint32_t)num_worlds, + .batchRenderViewWidth = (uint32_t)batch_render_view_width, + .batchRenderViewHeight = (uint32_t)batch_render_view_height, + .addCamDebugGeometry = add_cam_debug_geo, + .useRT = use_rt, + }, + gs_model, + viz_gpu_hdls != nullptr ? *viz_gpu_hdls : Optional::none() + ); free(ptr_geom_rgba); free(ptr_geom_mat_ids); @@ -152,22 +124,20 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { nb::arg("mesh_face_offsets"), nb::arg("mesh_texcoords"), nb::arg("mesh_texcoord_offsets"), - nb::arg("mesh_texcoord_num"), nb::arg("geom_types"), nb::arg("geom_groups"), nb::arg("geom_data_ids"), nb::arg("geom_sizes"), nb::arg("geom_mat_ids"), - nb::arg("geom_rgba"), nb::arg("mat_rgba"), nb::arg("mat_tex_ids"), + nb::arg("mat_tex_offsets"), nb::arg("tex_data"), nb::arg("tex_offsets"), nb::arg("tex_widths"), nb::arg("tex_heights"), nb::arg("tex_nchans"), nb::arg("num_lights"), - nb::arg("num_cams"), nb::arg("num_worlds"), nb::arg("batch_render_view_width"), nb::arg("batch_render_view_height"), @@ -185,7 +155,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { nb::ndarray> cam_pos, nb::ndarray> cam_rot, nb::ndarray> mat_ids, - nb::ndarray> geom_rgb, + // nb::ndarray> geom_rgb, nb::ndarray> geom_sizes, nb::ndarray> light_pos, nb::ndarray> light_dir, @@ -203,7 +173,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { reinterpret_cast(cam_pos.data()), reinterpret_cast(cam_rot.data()), reinterpret_cast(mat_ids.data()), - reinterpret_cast(geom_rgb.data()), + // reinterpret_cast(geom_rgb.data()), reinterpret_cast(geom_sizes.data()), reinterpret_cast(light_pos.data()), reinterpret_cast(light_dir.data()), diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp index 7945dedf..167bb256 100644 --- a/src/bridge/mgr.cpp +++ b/src/bridge/mgr.cpp @@ -265,11 +265,8 @@ struct Manager::Impl { const float *light_attenuation, const float *light_intensity) { - MWCudaLaunchGraph init_graph = - gpuExec.buildLaunchGraph(TaskGraphID::Init); - - MWCudaLaunchGraph render_init_graph = - gpuExec.buildLaunchGraph(TaskGraphID::RenderInit); + MWCudaLaunchGraph init_graph = gpuExec.buildLaunchGraph(TaskGraphID::Init); + MWCudaLaunchGraph render_init_graph = gpuExec.buildLaunchGraph(TaskGraphID::RenderInit); gpuExec.run(init_graph); @@ -418,16 +415,15 @@ static RTAssets loadRenderObjects( }, .objects { 0 }, .materials { 0 }, + .materialTextures { 0 }, .instances { 0 }, .textures { 0 }, }; - HeapArray meshes( - model.meshGeo.numMeshes + (size_t)RenderPrimObjectIDs::NumPrims); + HeapArray meshes(model.meshGeo.numMeshes + (size_t)RenderPrimObjectIDs::NumPrims); const CountT num_meshes = (CountT)model.meshGeo.numMeshes; - meshes[(size_t)RenderPrimObjectIDs::DebugCam] = - disk_render_assets->objects[(size_t)RenderPrimObjectIDs::DebugCam].meshes[0]; + meshes[(size_t)RenderPrimObjectIDs::DebugCam] = disk_render_assets->objects[(size_t)RenderPrimObjectIDs::DebugCam].meshes[0]; meshes[(size_t)RenderPrimObjectIDs::Plane] = CreatePlane(generated_assets); meshes[(size_t)RenderPrimObjectIDs::Sphere] = CreateSphere(generated_assets); meshes[(size_t)RenderPrimObjectIDs::Box] = CreateBox(generated_assets); @@ -475,9 +471,10 @@ static RTAssets loadRenderObjects( SourceTexture *out_textures = tmp_alloc.allocN(model.numTextures); for (CountT i = 0; i < model.numTextures; i++) { + // TODO: NChans is not used. uint64_t tex_offset = model.texOffsets[i]; Optional tex = SourceTexture { - .data = &model.texData[tex_offset], + .data = model.texData + tex_offset, .format = SourceTextureFormat::R8G8B8A8, .width = (uint32_t)model.texWidths[i], .height = (uint32_t)model.texHeights[i], @@ -490,25 +487,30 @@ static RTAssets loadRenderObjects( std::vector materials; for (CountT i = 0; i < model.numMats; i++) { - int32_t tex_idx = model.matTexIDs[i * 10]; + const math::Vector4 &rgba = model.matRGBA[i]; + uint32_t mat_tex_offset = model.matTexOffsets[i]; + uint32_t next_tex_offset = i < model.numMats - 1 ? + model.matTexOffsets[i + 1] : model.numMatTextures; + uint32_t mat_tex_num = next_tex_offset - mat_tex_offset; + SourceMaterial mat = { - .color = math::Vector4{ - model.matRGBA[i].x, model.matRGBA[i].y, - model.matRGBA[i].z, model.matRGBA[i].w}, - .textureIdx = tex_idx, + .color = math::Vector4{rgba.x, rgba.y, rgba.z, rgba.w}, + .textureIdx = model.matTexIDs + mat_tex_offset, + .numTextures = mat_tex_num, .roughness = 0.0f, - .metalness = 0.0f}; + .metalness = 0.0f + }; materials.push_back(mat); } // Create materials for geoms that do not have one assigned for (CountT i = 0; i < model.numGeoms; i++) { if (model.geomMatIDs[i] == -1) { + const math::Vector4 &rgba_i = model.geomRGBA[i]; SourceMaterial mat = { - .color = math::Vector4{ - model.geomRGBA[i].x, model.geomRGBA[i].y, - model.geomRGBA[i].z, model.geomRGBA[i].w}, - .textureIdx = -1, + .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w}, + .textureIdx = nullptr, + .numTextures = 0, .roughness = 0.8f, .metalness = 0.2f, }; @@ -517,12 +519,13 @@ static RTAssets loadRenderObjects( for (CountT j = i + 1; j < model.numGeoms; j++) { // FIX: Should probably implement == op for Vector4 + const math::Vector4 &rgba_j = model.geomRGBA[j]; if (model.geomMatIDs[j] == -1 && - model.geomRGBA[i].x == model.geomRGBA[j].x && - model.geomRGBA[i].y == model.geomRGBA[j].y && - model.geomRGBA[i].z == model.geomRGBA[j].z && - model.geomRGBA[i].w == model.geomRGBA[j].w) - { + rgba_i.x == rgba_j.x && + rgba_i.y == rgba_j.y && + rgba_i.z == rgba_j.z && + rgba_i.w == rgba_j.w + ) { model.geomMatIDs[j] = materials.size() - 1; } } @@ -643,44 +646,30 @@ Manager::Impl * Manager::Impl::make( sim_cfg.useRT = use_rt; CUcontext cu_ctx = MWCudaExecutor::initCUDA(mgr_cfg.gpuID); - - Optional render_gpu_state = - initRenderGPUState(mgr_cfg, viz_gpu_hdls); - + Optional render_gpu_state = initRenderGPUState(mgr_cfg, viz_gpu_hdls); Optional render_mgr = - initRenderManager(mgr_cfg, gs_model, - viz_gpu_hdls, render_gpu_state); + initRenderManager(mgr_cfg, gs_model, viz_gpu_hdls, render_gpu_state); - RTAssets rt_assets = loadRenderObjects( - gs_model, render_mgr, use_rt); + RTAssets rt_assets = loadRenderObjects(gs_model, render_mgr, use_rt); if (render_mgr.has_value()) { sim_cfg.renderBridge = render_mgr->bridge(); } else { sim_cfg.renderBridge = nullptr; } - int32_t *geom_types_gpu = (int32_t *)cu::allocGPU( - sizeof(int32_t) * gs_model.numGeoms); - int32_t *geom_data_ids_gpu = (int32_t *)cu::allocGPU( - sizeof(int32_t) * gs_model.numGeoms); - Vector3 *geom_sizes_gpu = (Vector3 *)cu::allocGPU( - sizeof(Vector3) * gs_model.numGeoms); + int32_t *geom_types_gpu = (int32_t *)cu::allocGPU(sizeof(int32_t) * gs_model.numGeoms); + int32_t *geom_data_ids_gpu = (int32_t *)cu::allocGPU(sizeof(int32_t) * gs_model.numGeoms); + Vector3 *geom_sizes_gpu = (Vector3 *)cu::allocGPU(sizeof(Vector3) * gs_model.numGeoms); float *cam_fovy = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams); float *cam_zfar = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams); float *cam_znear = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams); - REQ_CUDA(cudaMemcpy(geom_types_gpu, gs_model.geomTypes, - sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice)); - REQ_CUDA(cudaMemcpy(geom_data_ids_gpu, gs_model.geomDataIDs, - sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice)); - REQ_CUDA(cudaMemcpy(geom_sizes_gpu, gs_model.geomSizes, - sizeof(Vector3) * gs_model.numGeoms, cudaMemcpyHostToDevice)); - REQ_CUDA(cudaMemcpy(cam_fovy, gs_model.camFovy, - sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); - REQ_CUDA(cudaMemcpy(cam_znear, gs_model.camZNear, - sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); - REQ_CUDA(cudaMemcpy(cam_zfar, gs_model.camZFar, - sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(geom_types_gpu, gs_model.geomTypes, sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(geom_data_ids_gpu, gs_model.geomDataIDs, sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(geom_sizes_gpu, gs_model.geomSizes, sizeof(Vector3) * gs_model.numGeoms, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(cam_fovy, gs_model.camFovy, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(cam_znear, gs_model.camZNear, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); + REQ_CUDA(cudaMemcpy(cam_zfar, gs_model.camZFar, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice)); sim_cfg.geomTypes = geom_types_gpu; sim_cfg.geomDataIDs = geom_data_ids_gpu; @@ -690,9 +679,7 @@ Manager::Impl * Manager::Impl::make( sim_cfg.camZFar = cam_zfar; HeapArray world_inits(mgr_cfg.numWorlds); - - Optional render_cfg = - Optional::none(); + Optional render_cfg = Optional::none(); if (use_rt) { render_cfg = { .renderMode = CudaBatchRenderConfig::RenderMode::RGBD, @@ -703,9 +690,7 @@ Manager::Impl * Manager::Impl::make( }; } - std::vector hideseek_srcs = { - GPU_HIDESEEK_SRC_LIST - }; + std::vector hideseek_srcs = { GPU_HIDESEEK_SRC_LIST }; const char *py_root_env = getenv("MADRONA_ROOT_PATH"); std::filesystem::path root_dir = py_root_env ? py_root_env : std::filesystem::current_path(); std::for_each( @@ -735,8 +720,7 @@ Manager::Impl * Manager::Impl::make( CompileConfig::OptMode::LTO, }, cu_ctx, render_cfg); - Optional raytrace_graph = - Optional::none(); + Optional raytrace_graph = Optional::none(); if (use_rt) { raytrace_graph = gpu_exec.buildRenderGraph(); diff --git a/src/bridge/mgr.hpp b/src/bridge/mgr.hpp index 3ae2a692..a887969d 100644 --- a/src/bridge/mgr.hpp +++ b/src/bridge/mgr.hpp @@ -26,7 +26,6 @@ struct GSModelGeometry { uint32_t *triOffsets; madrona::math::Vector2 *texCoords; int32_t *texCoordOffsets; - uint32_t *texCoordNum; uint32_t numVertices; uint32_t numTris; uint32_t numMeshes; @@ -43,6 +42,7 @@ struct GSModel { madrona::math::Vector4 *geomRGBA; madrona::math::Vector4 *matRGBA; int32_t *matTexIDs; + int32_t *matTexOffsets; uint8_t *texData; int64_t *texOffsets; int32_t *texWidths; @@ -50,6 +50,7 @@ struct GSModel { int32_t *texNChans; uint32_t numGeoms; uint32_t numMats; + uint32_t numMatTextures; uint32_t numTextures; uint32_t numCams; uint32_t numLights; diff --git a/src/importer/gltf.cpp b/src/importer/gltf.cpp index 9e2977f5..1532be5e 100644 --- a/src/importer/gltf.cpp +++ b/src/importer/gltf.cpp @@ -1744,16 +1744,11 @@ static bool gltfImportAssets(LoaderData &loader, } imported.geoData.meshArrays.resize(new_mesh_arrays_start, [](auto *) {}); - imported.geoData.positionArrays.resize(new_vert_arrays_start, - [](auto *) {}); - imported.geoData.normalArrays.resize(new_normal_arrays_start, - [](auto *) {}); - imported.geoData.uvArrays.resize(new_uvs_arrays_start, - [](auto *) {}); - imported.objects.resize(new_objects_start, - [](auto *) {}); - imported.instances.resize(new_instances_start, - [](auto *) {}); + imported.geoData.positionArrays.resize(new_vert_arrays_start, [](auto *) {}); + imported.geoData.normalArrays.resize(new_normal_arrays_start, [](auto *) {}); + imported.geoData.uvArrays.resize(new_uvs_arrays_start, [](auto *) {}); + imported.objects.resize(new_objects_start, [](auto *) {}); + imported.instances.resize(new_instances_start, [](auto *) {}); imported.objects.push_back({ .meshes = Span( @@ -1816,14 +1811,23 @@ static bool gltfImportAssets(LoaderData &loader, int32_t texture_id = material.baseColorIdx; if (texture_id != -1) { texture_id += prev_tex_idx; + imported.materialTextures.push_back(uint32_t(texture_id)); + imported.materials.emplace_back(SourceMaterial { + .color = material.baseColor, + .textureIdx = &imported.materialTextures[imported.materialTextures.size() - 1], + .numTextures = 1, + .roughness = material.roughness, + .metalness = material.metallic, + }); + } else { + imported.materials.emplace_back(SourceMaterial { + .color = material.baseColor, + .textureIdx = nullptr, + .numTextures = 0, + .roughness = material.roughness, + .metalness = material.metallic, + }); } - SourceMaterial s_mat = { - .color = material.baseColor, - .textureIdx = texture_id, - .roughness = material.roughness, - .metalness = material.metallic, - }; - imported.materials.emplace_back(s_mat); } return true; diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp index b5facf1d..23bb9789 100644 --- a/src/render/batch_renderer.cpp +++ b/src/render/batch_renderer.cpp @@ -327,7 +327,7 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev, blend_info.pAttachments = blend_attachments.data(); // Dynamic - std::array dyn_enable { + std::array dyn_enable {{ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, }; @@ -753,9 +753,7 @@ static DrawCommandPackage makeDrawCommandPackage(vk::Device& dev, }; int64_t buffer_offsets[2]; - int64_t num_draw_bytes = utils::computeBufferOffsets( - buffer_sizes, buffer_offsets, 256); - + int64_t num_draw_bytes = utils::computeBufferOffsets(buffer_sizes, buffer_offsets, 256); vk::LocalBuffer drawBuffer = alloc.makeLocalBuffer(num_draw_bytes).value(); std::array desc_updates; @@ -931,8 +929,7 @@ static void makeBatchFrame(vk::Device& dev, } uint32_t max_num_view = cfg.numWorlds * cfg.maxViewsPerWorld; - HeapArray layered_targets = makeLayeredTargets( - cfg.renderWidth, cfg.renderHeight, max_num_view, dev, alloc); + HeapArray layered_targets = makeLayeredTargets(cfg.renderWidth, cfg.renderHeight, max_num_view, dev, alloc); uint64_t num_pixels = static_cast(max_num_view) * cfg.renderWidth * cfg.renderHeight; { @@ -1146,7 +1143,6 @@ static void issueRasterization(vk::Device &dev, rendering_info.pDepthAttachment = &depth_attach; dev.dt.cmdBeginRenderingKHR(draw_cmd, &rendering_info); - dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, draw_pipeline.hdls[0]); dev.dt.cmdBindIndexBuffer(draw_cmd, loaded_assets[0].buf.buffer, loaded_assets[0].idxBufferOffset, @@ -1160,13 +1156,10 @@ static void issueRasterization(vk::Device &dev, batch_frame.shadowAssetSet, }; - dev.dt.cmdBindDescriptorSets(draw_cmd, - VK_PIPELINE_BIND_POINT_GRAPHICS, - draw_pipeline.layout, - 0, - draw_descriptors.size(), - draw_descriptors.data(), - 0, nullptr); + dev.dt.cmdBindDescriptorSets( + draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, draw_pipeline.layout, 0, + draw_descriptors.size(), draw_descriptors.data(), 0, nullptr + ); uint32_t max_image_dim_x = std::min(consts::maxTextureDim, consts::maxNumImagesX * target.viewWidth); uint32_t max_num_image_x = max_image_dim_x / target.viewWidth; @@ -1263,7 +1256,7 @@ static void issueDeferred(vk::Device &dev, sizeof(shader::DeferredLightingPushConstBR), &push_const); - std::array draw_descriptors = { + std::array draw_descriptors { batch_frame.targetsSetLighting, batch_frame.viewInstanceSetLighting, pbr_set @@ -1275,14 +1268,10 @@ static void issueDeferred(vk::Device &dev, draw_descriptors.data(), 0, nullptr); - uint32_t num_workgroups_x = utils::divideRoundUp( - render_dims.width, 32_u32); - uint32_t num_workgroups_y = utils::divideRoundUp( - render_dims.height, 32_u32); + uint32_t num_workgroups_x = utils::divideRoundUp(render_dims.width, 32_u32); + uint32_t num_workgroups_y = utils::divideRoundUp(render_dims.height, 32_u32); uint32_t num_workgroups_z = total_num_views; - - dev.dt.cmdDispatch( - draw_cmd, num_workgroups_x, num_workgroups_y, num_workgroups_z); + dev.dt.cmdDispatch(draw_cmd, num_workgroups_x, num_workgroups_y, num_workgroups_z); } static void issueShadowGen(vk::Device &dev, @@ -1325,8 +1314,10 @@ static void issueShadowGen(vk::Device &dev, &push_const); // Descriptor sets - dev.dt.cmdBindDescriptorSets(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, - pipeline.layout, 0, 1, &frame.shadowGenSet, 0, nullptr); + dev.dt.cmdBindDescriptorSets( + draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline.layout, 0, 1, &frame.shadowGenSet, 0, nullptr + ); uint32_t num_workgroups_x = utils::divideRoundUp(max_num_views, 256_u32); dev.dt.cmdDispatch(draw_cmd, num_workgroups_x, 1, 1); @@ -1436,27 +1427,21 @@ static void issueShadowDraw(vk::Device &dev, rendering_info.pDepthAttachment = &depth_attach; dev.dt.cmdBeginRenderingKHR(draw_cmd, &rendering_info); + dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline.hdls[0]); + dev.dt.cmdBindIndexBuffer( + draw_cmd, loaded_assets[0].buf.buffer, loaded_assets[0].idxBufferOffset, VK_INDEX_TYPE_UINT32 + ); - dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipeline.hdls[0]); - - dev.dt.cmdBindIndexBuffer(draw_cmd, loaded_assets[0].buf.buffer, - loaded_assets[0].idxBufferOffset, - VK_INDEX_TYPE_UINT32); - - std::array draw_descriptors { + std::array draw_descriptors { batch_frame.shadowDrawSet, view_batch.drawBufferSetDraw, asset_set, }; - dev.dt.cmdBindDescriptorSets(draw_cmd, - VK_PIPELINE_BIND_POINT_GRAPHICS, - pipeline.layout, - 0, - draw_descriptors.size(), - draw_descriptors.data(), - 0, nullptr); + dev.dt.cmdBindDescriptorSets( + draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline.layout, 0, + draw_descriptors.size(), draw_descriptors.data(), 0, nullptr + ); uint32_t max_image_dim_x = std::min(consts::maxTextureDim, consts::maxNumImagesX * target.viewWidth); uint32_t max_num_image_x = max_image_dim_x / target.viewWidth; @@ -1651,7 +1636,7 @@ BatchRenderer::Impl::Impl(const Config &cfg, RenderContext &rctx): batchFrames(cfg.numFrames), assetSetPrepare(rctx.asset_set_cull_), assetSetDraw(rctx.asset_set_draw_), - assetSetTextureMat(rctx.asset_set_tex_compute_), + assetSetTextureMat(rctx.asset_set_mat_tex_), assetSetLighting(rctx.asset_batch_lighting_set_), renderExtent { cfg.renderWidth, cfg.renderHeight }, selectedView(0), @@ -1781,11 +1766,10 @@ static void issuePrepareViewsPipeline(vk::Device& dev, (void)num_views; (void)num_processed_batches; - dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, - prepare_views.hdls[0]); + dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, prepare_views.hdls[0]); { // Dispatch the compute shader - std::array view_gen_descriptors = { + std::array view_gen_descriptors = { frame.viewInstanceSetPrepare, batch.drawBufferSetPrepare, assetSetPrepareView, @@ -1793,11 +1777,13 @@ static void issuePrepareViewsPipeline(vk::Device& dev, rctx.loaded_assets_[0].aabbSet }; - dev.dt.cmdBindDescriptorSets(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, - prepare_views.layout, 0, - view_gen_descriptors.size(), - view_gen_descriptors.data(), - 0, nullptr); + dev.dt.cmdBindDescriptorSets( + draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, + prepare_views.layout, 0, + view_gen_descriptors.size(), + view_gen_descriptors.data(), + 0, nullptr + ); shader::PrepareViewPushConstant view_push_const = { num_views, view_start, num_worlds, num_instances, diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp index fc2eda24..d190e842 100644 --- a/src/render/render_ctx.cpp +++ b/src/render/render_ctx.cpp @@ -77,8 +77,7 @@ void initCommonDrawPipelineInfo( VkPipelineRasterizationStateCreateInfo &raster_info) { // Disable auto vertex assembly - vert_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vert_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; vert_info.pNext = nullptr; vert_info.flags = 0; vert_info.vertexBindingDescriptionCount = 0; @@ -87,30 +86,26 @@ void initCommonDrawPipelineInfo( vert_info.pVertexAttributeDescriptions = nullptr; // Assembly (standard tri indices) - input_assembly_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + input_assembly_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; input_assembly_info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; input_assembly_info.primitiveRestartEnable = VK_FALSE; // Viewport (fully dynamic) - viewport_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; viewport_info.viewportCount = 1; viewport_info.pViewports = nullptr; viewport_info.scissorCount = 1; viewport_info.pScissors = nullptr; // Multisample - multisample_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisample_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; multisample_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; multisample_info.sampleShadingEnable = VK_FALSE; multisample_info.alphaToCoverageEnable = VK_FALSE; multisample_info.alphaToOneEnable = VK_FALSE; // Rasterization - raster_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + raster_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; raster_info.depthClampEnable = VK_FALSE; raster_info.rasterizerDiscardEnable = VK_FALSE; raster_info.polygonMode = VK_POLYGON_MODE_FILL; @@ -285,12 +280,12 @@ static PipelineShaders makeDrawShaders( ShaderCompiler compiler; SPIRVShader vert_spirv = compiler.compileHLSLFileToSPV( - shader_path.c_str(), {}, {}, - { "vert", ShaderStage::Vertex }); + shader_path.c_str(), {}, {}, { "vert", ShaderStage::Vertex } + ); SPIRVShader frag_spirv = compiler.compileHLSLFileToSPV( - shader_path.c_str(), {}, {}, - { "frag", ShaderStage::Fragment }); + shader_path.c_str(), {}, {}, { "frag", ShaderStage::Fragment } + ); #if 0 {0, 2, repeat_sampler, 1, 0}, @@ -349,9 +344,7 @@ static Pipeline<1> makeDrawPipeline(const Device &dev, VkSampler clamp_sampler, uint32_t num_frames) { - auto shaders = - makeDrawShaders(dev, repeat_sampler, clamp_sampler); - + auto shaders = makeDrawShaders(dev, repeat_sampler, clamp_sampler); VkPipelineVertexInputStateCreateInfo vert_info {}; VkPipelineInputAssemblyStateCreateInfo input_assembly_info {}; VkPipelineViewportStateCreateInfo viewport_info {}; @@ -363,8 +356,7 @@ static Pipeline<1> makeDrawPipeline(const Device &dev, // Depth/Stencil VkPipelineDepthStencilStateCreateInfo depth_info {}; - depth_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depth_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; depth_info.depthTestEnable = VK_TRUE; depth_info.depthWriteEnable = VK_TRUE; depth_info.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL; @@ -423,16 +415,13 @@ static Pipeline<1> makeDrawPipeline(const Device &dev, gfx_layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; gfx_layout_info.pNext = nullptr; gfx_layout_info.flags = 0; - gfx_layout_info.setLayoutCount = - static_cast(draw_desc_layouts.size()); + gfx_layout_info.setLayoutCount = static_cast(draw_desc_layouts.size()); gfx_layout_info.pSetLayouts = draw_desc_layouts.data(); gfx_layout_info.pushConstantRangeCount = 1; gfx_layout_info.pPushConstantRanges = &push_const; VkPipelineLayout draw_layout; - REQ_VK(dev.dt.createPipelineLayout(dev.hdl, &gfx_layout_info, nullptr, - &draw_layout)); - + REQ_VK(dev.dt.createPipelineLayout(dev.hdl, &gfx_layout_info, nullptr, &draw_layout)); array gfx_stages {{ { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, @@ -755,12 +744,8 @@ static EngineInterop setupEngineInterop(Device &dev, view_offsets_base = view_offsets_cpu->ptr; } else { #ifdef MADRONA_VK_CUDA_SUPPORT - view_offsets_gpu = alloc.makeDedicatedBuffer( - num_offsets_bytes, false, true); - - view_offsets_cuda.emplace(dev, view_offsets_gpu->mem, - num_offsets_bytes); - + view_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true); + view_offsets_cuda.emplace(dev, view_offsets_gpu->mem, num_offsets_bytes); view_offsets_hdl = view_offsets_gpu->buf.buffer; view_offsets_base = (char *)view_offsets_cuda->getDevicePointer(); #endif @@ -776,12 +761,8 @@ static EngineInterop setupEngineInterop(Device &dev, light_offsets_base = light_offsets_cpu->ptr; } else { #ifdef MADRONA_VK_CUDA_SUPPORT - light_offsets_gpu = alloc.makeDedicatedBuffer( - num_offsets_bytes, false, true); - - light_offsets_cuda.emplace(dev, light_offsets_gpu->mem, - num_offsets_bytes); - + light_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true); + light_offsets_cuda.emplace(dev, light_offsets_gpu->mem, num_offsets_bytes); light_offsets_hdl = light_offsets_gpu->buf.buffer; light_offsets_base = (char *)light_offsets_cuda->getDevicePointer(); #endif @@ -807,15 +788,10 @@ static EngineInterop setupEngineInterop(Device &dev, voxel_buffer_hdl = voxel_cpu->buffer; } else { #ifdef MADRONA_VK_CUDA_SUPPORT - voxel_gpu = alloc.makeDedicatedBuffer( - staging_size, false, true); - - voxel_cuda.emplace( - dev, voxel_gpu->mem, staging_size); - + voxel_gpu = alloc.makeDedicatedBuffer(staging_size, false, true); + voxel_cuda.emplace(dev, voxel_gpu->mem, staging_size); voxel_buffer_hdl = voxel_gpu->buf.buffer; - voxel_buffer_ptr = num_voxels ? - (uint32_t *)voxel_cuda->getDevicePointer() : nullptr; + voxel_buffer_ptr = num_voxels ? (uint32_t *)voxel_cuda->getDevicePointer() : nullptr; #endif } @@ -842,8 +818,7 @@ static EngineInterop setupEngineInterop(Device &dev, total_num_lights_cpu_inc->store_release(0); } else { #ifdef MADRONA_VK_CUDA_SUPPORT - total_num_views_readback = (uint32_t *)cu::allocReadback( - 3*sizeof(uint32_t)); + total_num_views_readback = (uint32_t *)cu::allocReadback(3*sizeof(uint32_t)); total_num_instances_readback = total_num_views_readback + 1; total_num_lights_readback = total_num_instances_readback + 1; #endif @@ -1294,56 +1269,51 @@ static Sky loadSky(const vk::Device &dev, MemoryAllocator &alloc, VkQueue queue) } RenderContext::RenderContext( - APIBackend *render_backend, - GPUDevice *render_dev, - const RenderManager::Config &cfg) - : backend(*static_cast(render_backend)), - dev(static_cast(*render_dev)), - alloc(dev, backend), - renderQueue(makeGFXQueue(dev, 0)), - br_width_(cfg.agentViewWidth), - br_height_(cfg.agentViewHeight), - pipelineCache(getPipelineCache(dev)), - repeatSampler( - makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_REPEAT)), - clampSampler( - makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE)), - renderPass(makeRenderPass( - dev, VK_FORMAT_R8G8B8A8_UNORM, InternalConfig::gbufferFormat, - InternalConfig::gbufferFormat, InternalConfig::depthFormat)), - shadowPass(makeShadowRenderPass( - dev, InternalConfig::varianceFormat, InternalConfig::depthFormat)), - instanceCull(makeCullPipeline( - dev, pipelineCache, InternalConfig::numFrames)), - objectDraw(makeDrawPipeline(dev, pipelineCache, - renderPass, repeatSampler, clampSampler, - InternalConfig::numFrames)), - asset_desc_pool_cull_(dev, instanceCull.shaders, 1, 1), - asset_desc_pool_draw_(dev, objectDraw.shaders, 1, 1), - asset_desc_pool_mat_tx_(dev, objectDraw.shaders, 2, 1), - asset_set_cull_(asset_desc_pool_cull_.makeSet()), - asset_set_draw_(asset_desc_pool_draw_.makeSet()), - asset_set_mat_tex_(asset_desc_pool_mat_tx_.makeSet()), - load_cmd_pool_(makeCmdPool(dev, dev.gfxQF)), - load_cmd_(makeCmdBuffer(dev, load_cmd_pool_)), - load_fence_(makeFence(dev)), - engine_interop_(setupEngineInterop( - dev, alloc, cfg.execMode == ExecMode::CUDA, cfg.numWorlds, - cfg.maxViewsPerWorld, cfg.maxInstancesPerWorld, - cfg.maxLightsPerWorld, - br_width_, br_height_, cfg.voxelCfg)), - lights_(InternalConfig::maxLights), - loaded_assets_(0), - sky_(loadSky(dev, alloc, renderQueue)), - material_textures_(0), - voxel_config_(cfg.voxelCfg), - num_worlds_(cfg.numWorlds), - gpu_input_(cfg.execMode == ExecMode::CUDA) + APIBackend *render_backend, + GPUDevice *render_dev, + const RenderManager::Config &cfg +): backend(*static_cast(render_backend)), + dev(static_cast(*render_dev)), + alloc(dev, backend), + renderQueue(makeGFXQueue(dev, 0)), + br_width_(cfg.agentViewWidth), + br_height_(cfg.agentViewHeight), + pipelineCache(getPipelineCache(dev)), + repeatSampler(makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_REPEAT)), + clampSampler(makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE)), + renderPass(makeRenderPass( + dev, VK_FORMAT_R8G8B8A8_UNORM, InternalConfig::gbufferFormat, + InternalConfig::gbufferFormat, InternalConfig::depthFormat)), + shadowPass(makeShadowRenderPass( + dev, InternalConfig::varianceFormat, InternalConfig::depthFormat)), + instanceCull(makeCullPipeline(dev, pipelineCache, InternalConfig::numFrames)), + objectDraw(makeDrawPipeline(dev, pipelineCache, renderPass, repeatSampler, clampSampler, InternalConfig::numFrames)), + asset_desc_pool_cull_(dev, instanceCull.shaders, 1, 1), + asset_desc_pool_draw_(dev, objectDraw.shaders, 1, 1), + asset_desc_pool_mat_tx_(dev, objectDraw.shaders, 2, 1), + asset_set_cull_(asset_desc_pool_cull_.makeSet()), + asset_set_draw_(asset_desc_pool_draw_.makeSet()), + asset_set_mat_tex_(asset_desc_pool_mat_tx_.makeSet()), + load_cmd_pool_(makeCmdPool(dev, dev.gfxQF)), + load_cmd_(makeCmdBuffer(dev, load_cmd_pool_)), + load_fence_(makeFence(dev)), + engine_interop_(setupEngineInterop( + dev, alloc, cfg.execMode == ExecMode::CUDA, cfg.numWorlds, + cfg.maxViewsPerWorld, cfg.maxInstancesPerWorld, + cfg.maxLightsPerWorld, + br_width_, br_height_, cfg.voxelCfg)), + lights_(InternalConfig::maxLights), + loaded_assets_(0), + sky_(loadSky(dev, alloc, renderQueue)), + material_textures_(0), + voxel_config_(cfg.voxelCfg), + num_worlds_(cfg.numWorlds), + gpu_input_(cfg.execMode == ExecMode::CUDA) { { VkDescriptorPoolSize pool_sizes[] = { { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 25 }, - { VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, InternalConfig::maxTextures*2 }, + { VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, InternalConfig::maxTextures * 2 }, { VK_DESCRIPTOR_TYPE_SAMPLER, 1 } }; @@ -1353,8 +1323,7 @@ RenderContext::RenderContext( pool_info.maxSets = 10 + InternalConfig::maxTextures + 1; pool_info.poolSizeCount = 3; pool_info.pPoolSizes = pool_sizes; - REQ_VK(dev.dt.createDescriptorPool(dev.hdl, - &pool_info, nullptr, &asset_pool_)); + REQ_VK(dev.dt.createDescriptorPool(dev.hdl, &pool_info, nullptr, &asset_pool_)); } { @@ -1488,16 +1457,22 @@ RenderContext::RenderContext( .binding = 0, .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, .descriptorCount = InternalConfig::maxTextures, - .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, .pImmutableSamplers = nullptr, }, - { .binding = 1, .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER, .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, .pImmutableSamplers = &repeatSampler + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = nullptr, } }; @@ -1505,7 +1480,7 @@ RenderContext::RenderContext( .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .pNext = &flag_info, .flags = 0, - .bindingCount = 2, + .bindingCount = 3, .pBindings = bindings }; @@ -1518,12 +1493,8 @@ RenderContext::RenderContext( .pNext = nullptr, .descriptorPool = asset_pool_, .descriptorSetCount = 1, - .pSetLayouts = &asset_tex_layout_ + .pSetLayouts = &asset_batch_lighting_layout_ }; - - dev.dt.allocateDescriptorSets(dev.hdl, &alloc_info, &asset_set_tex_compute_); - - alloc_info.pSetLayouts = &asset_batch_lighting_layout_; dev.dt.allocateDescriptorSets(dev.hdl, &alloc_info, &asset_batch_lighting_set_); } @@ -1564,16 +1535,16 @@ RenderContext::~RenderContext() dev.dt.destroyImageView(dev.hdl, sky_.mieView, nullptr); dev.dt.destroyImageView(dev.hdl, sky_.scatteringView, nullptr); - dev.dt.freeMemory(dev.hdl, sky_.transmittanceBacking, nullptr); - dev.dt.freeMemory(dev.hdl, sky_.irradianceBacking, nullptr); - dev.dt.freeMemory(dev.hdl, sky_.mieBacking, nullptr); - dev.dt.freeMemory(dev.hdl, sky_.scatteringBacking, nullptr); - dev.dt.destroyImage(dev.hdl, sky_.transmittance.image, nullptr); dev.dt.destroyImage(dev.hdl, sky_.irradiance.image, nullptr); dev.dt.destroyImage(dev.hdl, sky_.singleMieScattering.image, nullptr); dev.dt.destroyImage(dev.hdl, sky_.scattering.image, nullptr); + dev.dt.freeMemory(dev.hdl, sky_.transmittanceBacking, nullptr); + dev.dt.freeMemory(dev.hdl, sky_.irradianceBacking, nullptr); + dev.dt.freeMemory(dev.hdl, sky_.mieBacking, nullptr); + dev.dt.freeMemory(dev.hdl, sky_.scatteringBacking, nullptr); + dev.dt.destroyFence(dev.hdl, load_fence_, nullptr); dev.dt.destroyCommandPool(dev.hdl, load_cmd_pool_, nullptr); @@ -1774,42 +1745,32 @@ static DynArray loadTextures( dev.dt.beginCommandBuffer(cmdbuf, &begin_info); - for (const imp::SourceTexture &tx : textures) - { + for (const imp::SourceTexture &tx : textures) { if (tx.format == imp::SourceTextureFormat::BC7) { void *pixel_data = tx.data; uint32_t pixel_data_size = tx.numBytes; - - uint32_t width = tx.width, - height = tx.height; - - auto [texture, texture_reqs] = alloc.makeTexture2D( - width, height, 1, VK_FORMAT_BC7_UNORM_BLOCK); + uint32_t width = tx.width; + uint32_t height = tx.height; + auto [texture, texture_reqs] = alloc.makeTexture2D(width, height, 1, VK_FORMAT_BC7_UNORM_BLOCK); HostBuffer texture_hb_staging = alloc.makeStagingBuffer(texture_reqs.size); memcpy(texture_hb_staging.ptr, pixel_data, pixel_data_size); texture_hb_staging.flush(dev); - std::optional texture_backing = alloc.alloc(texture_reqs.size); - assert(texture_backing.has_value()); - dev.dt.bindImageMemory(dev.hdl, texture.image, texture_backing.value(), 0); VkImageMemoryBarrier copy_prepare { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - nullptr, - 0, - VK_ACCESS_MEMORY_WRITE_BIT, - VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - VK_QUEUE_FAMILY_IGNORED, - VK_QUEUE_FAMILY_IGNORED, - texture.image, - { - VK_IMAGE_ASPECT_COLOR_BIT, - 0, 1, 0, 1 - }, + nullptr, + 0, + VK_ACCESS_MEMORY_WRITE_BIT, + VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + texture.image, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, }; dev.dt.cmdPipelineBarrier(cmdbuf, @@ -1836,18 +1797,15 @@ static DynArray loadTextures( VkImageMemoryBarrier finish_prepare { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - nullptr, - VK_ACCESS_MEMORY_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, - VK_QUEUE_FAMILY_IGNORED, - VK_QUEUE_FAMILY_IGNORED, - texture.image, - { - VK_IMAGE_ASPECT_COLOR_BIT, - 0, 1, 0, 1 - }, + nullptr, + VK_ACCESS_MEMORY_WRITE_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + texture.image, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }, }; @@ -1881,23 +1839,16 @@ static DynArray loadTextures( uint32_t height = tx.height; uint32_t mip_levels = std::max(1, (int32_t)std::floor(std::log2(std::max(width, height)))); - - auto [texture, texture_reqs] = alloc.makeTexture2D( - width, height, mip_levels, VK_FORMAT_R8G8B8A8_SRGB); + auto [texture, texture_reqs] = alloc.makeTexture2D(width, height, mip_levels, VK_FORMAT_R8G8B8A8_SRGB); HostBuffer texture_hb_staging = alloc.makeStagingBuffer(texture_reqs.size); memcpy(texture_hb_staging.ptr, pixels, width * height * 4 * sizeof(char)); texture_hb_staging.flush(dev); - std::optional texture_backing = alloc.alloc(texture_reqs.size); - assert(texture_backing.has_value()); - dev.dt.bindImageMemory(dev.hdl, texture.image, texture_backing.value(), 0); - copyBufferToImage(dev, cmdbuf, texture_hb_staging, texture.image, width, height, mip_levels); host_buffers.push_back(std::move(texture_hb_staging)); - generateMipmaps(dev, cmdbuf, texture.image, width, height, mip_levels); VkImageViewCreateInfo view_info {}; @@ -1914,8 +1865,6 @@ static DynArray loadTextures( view_info.image = texture.image; view_info.format = VK_FORMAT_R8G8B8A8_SRGB; REQ_VK(dev.dt.createImageView(dev.hdl, &view_info, nullptr, &view)); - - dst_textures.emplace_back(std::move(texture), view, texture_backing.value()); } } @@ -1957,6 +1906,9 @@ CountT RenderContext::loadObjects(Span src_objs, int64_t num_total_vertices = 0; int64_t num_total_indices = 0; int64_t num_total_meshes = 0; + int64_t num_total_objs = src_objs.size(); + int64_t num_total_materials = src_mats.size(); + int64_t num_total_textures = 0; for (const SourceObject &obj : src_objs) { num_total_meshes += obj.meshes.size(); @@ -1968,41 +1920,32 @@ CountT RenderContext::loadObjects(Span src_objs, num_total_vertices += mesh.numVertices; num_total_indices += mesh.numFaces * 3; + num_total_textures += src_mats[mesh.materialIDX].numTextures; } } - int64_t num_total_objs = src_objs.size(); - int64_t buffer_offsets[5]; - int64_t buffer_sizes[6] = { + int64_t buffer_offsets[6]; + int64_t buffer_sizes[7] = { (int64_t)sizeof(ObjectData) * num_total_objs, (int64_t)sizeof(MeshData) * num_total_meshes, (int64_t)sizeof(PackedVertex) * num_total_vertices, (int64_t)sizeof(uint32_t) * num_total_indices, - (int64_t)sizeof(MaterialDataShader) * src_mats.size(), - (int64_t)sizeof(ShaderAABB) * num_total_objs + (int64_t)sizeof(MaterialDataShader) * num_total_materials, + (int64_t)sizeof(ShaderAABB) * num_total_objs, + (int64_t)sizeof(uint32_t) * num_total_textures }; - int64_t num_asset_bytes = utils::computeBufferOffsets( - buffer_sizes, buffer_offsets, 256); - + int64_t num_asset_bytes = utils::computeBufferOffsets(buffer_sizes, buffer_offsets, 256); HostBuffer staging = alloc.makeStagingBuffer(num_asset_bytes); char *staging_ptr = (char *)staging.ptr; ObjectData *obj_ptr = (ObjectData *)staging_ptr; - MeshData *mesh_ptr = - (MeshData *)(staging_ptr + buffer_offsets[0]); - PackedVertex *vertex_ptr = - (PackedVertex *)(staging_ptr + buffer_offsets[1]); - uint32_t *indices_ptr = - (uint32_t *)(staging_ptr + buffer_offsets[2]); - MaterialDataShader *materials_ptr = - (MaterialDataShader *)(staging_ptr + buffer_offsets[3]); - ShaderAABB *aabbs_ptr = - (ShaderAABB *)(staging_ptr + buffer_offsets[4]); - - int32_t mesh_offset = 0; - int32_t vertex_offset = 0; - int32_t index_offset = 0; + MeshData *mesh_ptr = (MeshData *)(staging_ptr + buffer_offsets[0]); + PackedVertex *vertex_ptr = (PackedVertex *)(staging_ptr + buffer_offsets[1]); + uint32_t *indices_ptr = (uint32_t *)(staging_ptr + buffer_offsets[2]); + MaterialDataShader *materials_ptr = (MaterialDataShader *)(staging_ptr + buffer_offsets[3]); + ShaderAABB *aabbs_ptr = (ShaderAABB *)(staging_ptr + buffer_offsets[4]); + uint32_t *mat_textures_ptr = (uint32_t *)(staging_ptr + buffer_offsets[5]); auto packHalf2x16 = [](const Vector2 &v) { #if defined(MADRONA_MSVC) @@ -2020,37 +1963,35 @@ CountT RenderContext::loadObjects(Span src_objs, y_half = v.y; #endif - return uint32_t(std::bit_cast(y_half)) << 16 | - uint32_t(std::bit_cast(x_half)); + return uint32_t(std::bit_cast(y_half)) << 16 | uint32_t(std::bit_cast(x_half)); }; + int32_t obj_offset = 0; + int32_t mesh_offset = 0; + int32_t vertex_offset = 0; + int32_t index_offset = 0; for (const SourceObject &obj : src_objs) { - *obj_ptr++ = ObjectData { - .meshOffset = mesh_offset, - .numMeshes = (int32_t)obj.meshes.size(), - }; + ObjectData *obj_data = obj_ptr + (obj_offset++); + obj_data->meshOffset = mesh_offset; + obj_data->numMeshes = (int32_t)obj.meshes.size(); for (const SourceMesh &mesh : obj.meshes) { uint32_t material_idx = mesh.materialIDX; - int32_t num_mesh_verts = (int32_t)mesh.numVertices; int32_t num_mesh_indices = (int32_t)mesh.numFaces * 3; - MeshData mesh_data = MeshData {}; - mesh_data.vertexOffset = vertex_offset; - mesh_data.numVertices = num_mesh_verts; - mesh_data.indexOffset = index_offset; - mesh_data.numIndices = num_mesh_indices; - mesh_data.materialIndex = (int32_t)material_idx; - - mesh_ptr[mesh_offset++] = mesh_data; + MeshData *mesh_data = mesh_ptr + (mesh_offset++); + mesh_data->vertexOffset = vertex_offset; + mesh_data->numVertices = num_mesh_verts; + mesh_data->indexOffset = index_offset; + mesh_data->numIndices = num_mesh_indices; + mesh_data->materialIndex = (int32_t)material_idx; // Compute new normals HeapArray new_normals(num_mesh_verts); memset(new_normals.data(), 0, num_mesh_verts * sizeof(Vector3)); - for (CountT face_idx = 0; face_idx < (CountT)mesh.numFaces; - face_idx++) { + for (CountT face_idx = 0; face_idx < (CountT)mesh.numFaces; face_idx++) { CountT base_idx = face_idx * 3; uint32_t i0 = mesh.indices[base_idx]; uint32_t i1 = mesh.indices[base_idx + 1]; @@ -2062,7 +2003,6 @@ CountT RenderContext::loadObjects(Span src_objs, Vector3 e0 = v1 - v0; Vector3 e1 = v2 - v0; - Vector3 face_normal = cross(e0, e1); new_normals[i0] += face_normal; // align with pyrender @@ -2070,98 +2010,70 @@ CountT RenderContext::loadObjects(Span src_objs, new_normals[i2] += face_normal; } - for (int64_t vert_idx = 0; vert_idx < num_mesh_verts; - vert_idx++) { - if (new_normals[vert_idx].length() == 0.f) { - new_normals[vert_idx] = math::up; + for (CountT vert_idx = 0; vert_idx < num_mesh_verts; vert_idx++) { + Vector3 pos = mesh.positions[vert_idx]; + + // set normals + Vector3 normal = new_normals[vert_idx]; + if (normal.length() == 0.f) { + normal = math::up; } else { - new_normals[vert_idx] = normalize(new_normals[vert_idx]); + normal = normalize(normal); } - } - - for (int32_t i = 0; i < num_mesh_verts; i++) { - Vector3 pos = mesh.positions[i]; - Vector3 normal = new_normals[i]; - if(mesh.normals) { - mesh.normals[i] = normal; + new_normals[vert_idx] = normal; + if (mesh.normals) { + mesh.normals[vert_idx] = normal; } + // set tangent and sign Vector3 a, b; normal.frame(&a, &b); - Vector4 tangent_sign = { - a.x, - a.y, - a.z, - 1.f, - }; - if(mesh.tangentAndSigns) { - mesh.tangentAndSigns[i] = tangent_sign; + Vector4 tangent_sign = { a.x, a.y, a.z, 1.f }; + if (mesh.tangentAndSigns) { + mesh.tangentAndSigns[vert_idx] = tangent_sign; } - Vector2 uv = mesh.uvs ? mesh.uvs[i] : Vector2 { 0, 0 }; - - Vector3 encoded_normal_tangent = - encodeNormalTangent(normal, tangent_sign); + Vector2 uv = mesh.uvs ? mesh.uvs[vert_idx] : Vector2 { 0, 0 }; // Encode UVs into a uint32 (how bad will this look on small images?? // Let's see I guess. - float encoded_uvs = std::bit_cast(packHalf2x16(uv)); - - vertex_ptr[vertex_offset++] = PackedVertex { - Vector4 { - pos.x, - pos.y, - pos.z, - encoded_normal_tangent.x, - }, - Vector4 { - encoded_normal_tangent.y, - encoded_normal_tangent.z, - encoded_uvs, - 0 - }, - }; + Vector3 enc_nt = encodeNormalTangent(normal, tangent_sign); + float enc_uvs = std::bit_cast(packHalf2x16(uv)); + PackedVertex *vertex_data = vertex_ptr + (vertex_offset++); + vertex_data->data[0] = Vector4 { pos.x, pos.y, pos.z, enc_nt.x }; + vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uv, 0 }; } - memcpy(indices_ptr + index_offset, - mesh.indices, sizeof(uint32_t) * num_mesh_indices); - + memcpy(indices_ptr + index_offset, mesh.indices, sizeof(uint32_t) * num_mesh_indices); index_offset += num_mesh_indices; } } - uint32_t mat_idx = 0; + int32_t mat_offset = 0; + int32_t mat_texture_offset = 0; for (const SourceMaterial &mat : src_mats) { - materials_ptr[mat_idx].color = mat.color; - materials_ptr[mat_idx].roughness = mat.roughness; - materials_ptr[mat_idx].metalness = mat.metalness; - materials_ptr[mat_idx++].textureIdx = mat.textureIdx; + int32_t num_mat_textures = (int32_t)mat.numTextures; + MaterialData *mat_data = materials_ptr + (mat_offset++); + mat_data->color = mat.color; + mat_data->roughness = mat.roughness; + mat_data->metalness = mat.metalness; + mat_data->textureOffset = mat_texture_offset; + mat_data->numTextures = num_mat_textures; + memcpy(mat_textures_ptr + mat_texture_offset, mat.textureIdx, sizeof(uint32_t) * num_mat_textures); + mat_texture_offset += num_mat_textures; } + int32_t aabb_offset = 0; math::AABB *aabbs_src = AssetProcessor::makeAABBs(src_objs); - - ShaderAABB *shader_aabbs_src = (ShaderAABB *)malloc( - sizeof(ShaderAABB) * src_objs.size()); - - for (int i = 0; i < src_objs.size(); ++i) { - math::AABB *src = aabbs_src + i; - ShaderAABB *current = shader_aabbs_src + i; - - current->data[0].x = src->pMin.x; - current->data[0].y = src->pMin.y; - current->data[0].z = src->pMin.z; - - current->data[0].w = src->pMax.x; - current->data[1].x = src->pMax.y; - current->data[1].y = src->pMax.z; + for (int i = 0; i < num_total_objs; ++i) { + math::AABB *aabb = aabbs_src + i; + ShaderAABB *shader_aabb = aabbs_ptr + (aabb_offset++); + shader_aabb->data[0] = Vector4 { aabb->pMin.x, aabb->pMin.y, aabb->pMin.z, aabb->pMax.x }; + shader_aabb->data[1] = Vector4 { aabb->pMax.y, aabb->pMax.z, 0.f, 0.f }; } - - memcpy(aabbs_ptr, shader_aabbs_src, sizeof(ShaderAABB) * src_objs.size()); - free(aabbs_src); staging.flush(dev); - LocalBuffer asset_buffer = *alloc.makeLocalBuffer(num_asset_bytes); GPURunUtil gpu_run { load_cmd_pool_, @@ -2178,9 +2090,7 @@ CountT RenderContext::loadObjects(Span src_objs, .size = (VkDeviceSize)num_asset_bytes, }; - dev.dt.cmdCopyBuffer(load_cmd_, staging.buffer, asset_buffer.buffer, - 1, &buffer_copy); - + dev.dt.cmdCopyBuffer(load_cmd_, staging.buffer, asset_buffer.buffer, 1, &buffer_copy); gpu_run.submit(dev); @@ -2248,7 +2158,7 @@ CountT RenderContext::loadObjects(Span src_objs, VkDescriptorBufferInfo index_set_info; index_set_info.buffer = asset_buffer.buffer; index_set_info.offset = buffer_offsets[2]; - index_set_info.range = buffer_offsets[3] - buffer_offsets[2]; + index_set_info.range = buffer_sizes[3]; desc_updates.push_back({}); DescHelper::storage(desc_updates[4], index_buffer_set, &index_set_info, 0); @@ -2274,21 +2184,25 @@ CountT RenderContext::loadObjects(Span src_objs, material_textures_ = loadTextures(dev, alloc, renderQueue, textures); } - DynArray tx_infos(material_textures_.size()); - for (auto &tx : material_textures_) { - tx_infos.push_back({ - VK_NULL_HANDLE, - tx.view, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL - }); - } - if (material_textures_.size()) { + DynArray tx_infos(material_textures_.size()); + for (auto &tx : material_textures_) { + tx_infos.push_back({ + .sampler = VK_NULL_HANDLE, + .imageView = tx.view, + .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL + }); + } desc_updates.push_back({}); DescHelper::textures(desc_updates[9], asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0); + + VkDescriptorBufferInfo mat_tx_info; + mat_tx_info.buffer = asset_buffer.buffer; + mat_tx_info.offset = buffer_offsets[5]; + mat_tx_info.range = buffer_sizes[6]; desc_updates.push_back({}); - DescHelper::textures(desc_updates[10], asset_set_tex_compute_, tx_infos.data(), tx_infos.size(), 0); + DescHelper::storage(desc_updates[10], asset_set_mat_tex_, &mat_tx_info, 2); } DescHelper::update(dev, desc_updates.data(), desc_updates.size()); diff --git a/src/render/render_ctx.hpp b/src/render/render_ctx.hpp index 8ad81310..b745ad4c 100644 --- a/src/render/render_ctx.hpp +++ b/src/render/render_ctx.hpp @@ -76,7 +76,6 @@ struct RenderContext { VkDescriptorSetLayout asset_batch_draw_layout_; VkDescriptorPool asset_pool_; - VkDescriptorSet asset_set_tex_compute_; VkDescriptorSet asset_batch_lighting_set_; // This descriptor set contains information about the sky diff --git a/src/render/shaders/batch_draw_depth.hlsl b/src/render/shaders/batch_draw_depth.hlsl index 70b9279c..bdd210e1 100644 --- a/src/render/shaders/batch_draw_depth.hlsl +++ b/src/render/shaders/batch_draw_depth.hlsl @@ -47,11 +47,8 @@ float4 vert(in uint vid : SV_VertexID, Vertex vert = unpackVertex(vertexDataBuffer[vid]); uint instance_id = draw_data.instanceID; - PerspectiveCameraData view_data = - unpackViewData(viewDataBuffer[draw_data.viewID]); - - EngineInstanceData instance_data = unpackEngineInstanceData( - engineInstanceBuffer[instance_id]); + PerspectiveCameraData view_data = unpackViewData(viewDataBuffer[draw_data.viewID]); + EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]); float3 to_view_translation; float4 to_view_rotation; @@ -96,13 +93,9 @@ struct PixelOutput { }; [shader("pixel")] -PixelOutput frag(in V2F v2f, - in uint prim_id : SV_PrimitiveID) +PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID) { PixelOutput output; - - output.depthOut = length(v2f.vsCoord) + - min(0.0, abs(materialBuffer[0].color.x)); - + output.depthOut = length(v2f.vsCoord) + min(0.0, abs(materialBuffer[0].color.x)); return output; } diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl index 540753d5..377d60f5 100644 --- a/src/render/shaders/batch_draw_rgb.hlsl +++ b/src/render/shaders/batch_draw_rgb.hlsl @@ -49,6 +49,9 @@ Texture2D materialTexturesArray[]; [[vk::binding(1, 3)]] SamplerState linearSampler; +[[vk::binding(2, 3)]] +StructuredBuffer materialTexturesIndices; + [[vk::binding(0, 4)]] Texture2D shadowMapTextures[]; @@ -61,11 +64,10 @@ struct V2F { [[vk::location(1)]] float3 worldPos : TEXCOORD0; [[vk::location(2)]] float2 uv : TEXCOORD1; [[vk::location(3)]] int materialIdx : TEXCOORD2; - [[vk::location(4)]] uint color : TEXCOORD3; - [[vk::location(5)]] float3 worldNormal : TEXCOORD4; - [[vk::location(6)]] uint worldIdx : TEXCOORD5; - [[vk::location(7)]] uint viewIdx : TEXCOORD6; - [[vk::location(8)]] uint objectIdx : TEXCOORD7; + [[vk::location(4)]] float3 worldNormal : TEXCOORD3; + [[vk::location(5)]] uint worldIdx : TEXCOORD4; + [[vk::location(6)]] uint viewIdx : TEXCOORD5; + [[vk::location(7)]] uint objectIdx : TEXCOORD6; }; @@ -78,22 +80,18 @@ void vert(in uint vid : SV_VertexID, Vertex vert = unpackVertex(vertexDataBuffer[vid]); uint instance_id = draw_data.instanceID; - - PerspectiveCameraData view_data = - unpackViewData(viewDataBuffer[draw_data.viewID]); - - EngineInstanceData instance_data = unpackEngineInstanceData( - engineInstanceBuffer[instance_id]); + PerspectiveCameraData view_data = unpackViewData(viewDataBuffer[draw_data.viewID]); + EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]); float3 to_view_translation; float4 to_view_rotation; - computeCompositeTransform(instance_data.position, instance_data.rotation, + computeCompositeTransform( + instance_data.position, instance_data.rotation, view_data.pos, view_data.rot, - to_view_translation, to_view_rotation); + to_view_translation, to_view_rotation + ); - float3 view_pos = - rotateVec(to_view_rotation, instance_data.scale * vert.position) + - to_view_translation; + float3 view_pos = rotateVec(to_view_rotation, instance_data.scale * vert.position) + to_view_translation; float z_far = view_data.zFar; float z_near = view_data.zNear; float4 clip_pos = float4( @@ -115,25 +113,21 @@ void vert(in uint vid : SV_VertexID, v2f.worldPos = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position; v2f.position = clip_pos; - v2f.uv = vert.uv; + v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y); v2f.worldNormal = rotateVec(instance_data.rotation, vert.normal); v2f.worldIdx = instance_data.worldID; v2f.viewIdx = draw_data.viewID; v2f.objectIdx = instance_data.objectID; - if (instance_data.matID == -2) { - v2f.materialIdx = -2; - v2f.color = instance_data.color; - } else if (instance_data.matID == -1) { + if (instance_data.matID == -1) { v2f.materialIdx = meshDataBuffer[draw_data.meshID].materialIndex; - v2f.color = 0; } else { v2f.materialIdx = instance_data.matID; - v2f.color = 0; } } -float calculateLightAttenuating(ShaderLightData light, float3 worldPos) { +float calculateLightAttenuating(ShaderLightData light, float3 worldPos) +{ if (light.isDirectional) { // Directional light return 1.0f; } else { // Spot light @@ -142,7 +136,8 @@ float calculateLightAttenuating(ShaderLightData light, float3 worldPos) { } } -float3 calculateRayDirection(ShaderLightData light, float3 worldPos) { +float3 calculateRayDirection(ShaderLightData light, float3 worldPos) +{ if (light.isDirectional) { // Directional light return normalize(light.direction.xyz); } else { // Spot light @@ -157,7 +152,8 @@ float3 calculateRayDirection(ShaderLightData light, float3 worldPos) { } } -float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim) { +float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim) +{ uint num_views_per_image = pushConst.maxShadowMapsXPerTarget * pushConst.maxShadowMapsYPerTarget; @@ -174,7 +170,8 @@ float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim) { return float4(scale, offset); } -float linear_step(float low, float high, float v) { +float linear_step(float low, float high, float v) +{ return clamp((v - low) / (high - low), 0, 1); } @@ -197,8 +194,7 @@ float samplePCF(uint shadow_map_target_idx, float2 uv, float z) float4 calculuateLightSpacePosition(float3 world_pos, uint view_idx) { float4 world_pos_v4 = float4(world_pos.xyz, 1.f); - float4 ls_pos = mul(shadowViewDataBuffer[view_idx].viewProjectionMatrix, - world_pos_v4); + float4 ls_pos = mul(shadowViewDataBuffer[view_idx].viewProjectionMatrix, world_pos_v4); ls_pos.xyz /= ls_pos.w; return ls_pos; @@ -278,47 +274,47 @@ PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID) output.rgbOut = float4(0.0, 0.0, 0.0, 1.0); } else { + MaterialData mat_data = materialBuffer[v2f.materialIdx]; + float4 color = mat_data.color; + + int texture_idx = -1; + uint texture_count = mat_data.textureCount; + if (texture_count > 0) { + uint texture_start = mat_data.textureOffset; + texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count]; + } + if (texture_idx != -1) { + color *= materialTexturesArray[texture_idx].Sample(linearSampler, v2f.uv); + } + + float3 totalLighting = float3(0.f, 0.f, 0.f); + uint numLights = pushConst.numLights; + float shadowFactor = shadowFactorVSM(v2f.worldPos, v2f.viewIdx); - if (v2f.materialIdx == -2) { - output.rgbOut = hexToRgb(v2f.color); - } else { - MaterialData mat_data = materialBuffer[v2f.materialIdx]; - float4 color = mat_data.color; + [unroll(1)] + for (uint i = 0; i < numLights; i++) { + ShaderLightData light = unpackLightData(lightDataBuffer[v2f.worldIdx * numLights + i]); + if (!light.active) { + continue; + } - if (mat_data.textureIdx != -1) { - color *= materialTexturesArray[mat_data.textureIdx].Sample( - linearSampler, v2f.uv); + float3 ray_dir = calculateRayDirection(light, v2f.worldPos); + if (all(ray_dir == float3(0, 0, 0))) { + continue; } - float3 totalLighting = float3(0.f, 0.f, 0.f); - uint numLights = pushConst.numLights; - float shadowFactor = shadowFactorVSM(v2f.worldPos, v2f.viewIdx); - - [unroll(1)] - for (uint i = 0; i < numLights; i++) { - ShaderLightData light = unpackLightData(lightDataBuffer[v2f.worldIdx * numLights + i]); - if (!light.active) { - continue; - } - - float3 ray_dir = calculateRayDirection(light, v2f.worldPos); - if (all(ray_dir == float3(0, 0, 0))) { - continue; - } - - float n_dot_l = max(0.0, dot(normal, -ray_dir)); - float attenuating_factor = calculateLightAttenuating(light, v2f.worldPos); - totalLighting += attenuating_factor * hexToRgb(light.color).rgb * n_dot_l * light.intensity; - - // Apply shadow to the shadowed light. Only support one shadow per view for now. - if (i == shadowViewDataBuffer[v2f.viewIdx].lightIdx) { - totalLighting *= shadowFactor; - } + float n_dot_l = max(0.0, dot(normal, -ray_dir)); + float attenuating_factor = calculateLightAttenuating(light, v2f.worldPos); + totalLighting += attenuating_factor * hexToRgb(light.color).rgb * n_dot_l * light.intensity; + + // Apply shadow to the shadowed light. Only support one shadow per view for now. + if (i == shadowViewDataBuffer[v2f.viewIdx].lightIdx) { + totalLighting *= shadowFactor; } - - color.rgb = (totalLighting + ambient) * color.rgb; - output.rgbOut = color; } + + color.rgb = (totalLighting + ambient) * color.rgb; + output.rgbOut = color; } if (renderOptions.outputs[2]) { diff --git a/src/render/shaders/draw_gbuffer.hlsl b/src/render/shaders/draw_gbuffer.hlsl index 3181218d..22224574 100644 --- a/src/render/shaders/draw_gbuffer.hlsl +++ b/src/render/shaders/draw_gbuffer.hlsl @@ -29,6 +29,9 @@ Texture2D materialTexturesArray[]; [[vk::binding(1, 2)]] SamplerState linearSampler; +[[vk::binding(2, 2)]] +StructuredBuffer materialTexturesIndices; + struct V2F { [[vk::location(0)]] float3 normal : TEXCOORD0; [[vk::location(1)]] float3 position : TEXCOORD1; @@ -51,11 +54,8 @@ float4 vert(in uint vid : SV_VertexID, float4 color = materialBuffer[vert.materialIdx].color; uint instance_id = draw_data.instanceID; - PerspectiveCameraData view_data = - unpackViewData(viewDataBuffer[push_const.viewIdx]); - - EngineInstanceData instance_data = unpackEngineInstanceData( - engineInstanceBuffer[instance_id]); + PerspectiveCameraData view_data =unpackViewData(viewDataBuffer[push_const.viewIdx]); + EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]); float3 to_view_translation; float4 to_view_rotation; diff --git a/src/render/shaders/shader_common.h b/src/render/shaders/shader_common.h index 178e89e6..17136283 100644 --- a/src/render/shaders/shader_common.h +++ b/src/render/shaders/shader_common.h @@ -163,13 +163,11 @@ struct MeshData { struct MaterialData { // For now, just a color float4 color; - - int32_t textureIdx; + int32_t textureOffset; + int32_t numTextures; float roughness; float metalness; - - int32_t pad[1]; }; struct ObjectData { @@ -192,7 +190,6 @@ struct EngineInstanceData { int32_t matID; int32_t objectID; int32_t worldID; - uint32_t color; }; struct PackedViewData { @@ -275,7 +272,6 @@ struct DrawCmd { struct DrawData { int instanceID; int materialID; - uint32_t color; }; struct RenderOptions { diff --git a/src/render/shaders/shader_utils.hlsl b/src/render/shaders/shader_utils.hlsl index 58183922..5d8b3715 100644 --- a/src/render/shaders/shader_utils.hlsl +++ b/src/render/shaders/shader_utils.hlsl @@ -75,7 +75,6 @@ EngineInstanceData unpackEngineInstanceData(PackedInstanceData packed) o.matID = asint(d2.z); o.objectID = asint(d2.w); o.worldID = asint(d3.x); - o.color = asuint(d3.y); return o; } diff --git a/src/render/shaders/viewer_cull.hlsl b/src/render/shaders/viewer_cull.hlsl index 60b98a3f..e85be5b8 100644 --- a/src/render/shaders/viewer_cull.hlsl +++ b/src/render/shaders/viewer_cull.hlsl @@ -79,9 +79,7 @@ void instanceCull(uint3 tid : SV_DispatchThreadID, uint current_instance_idx = sm.instancesOffset + local_idx; - EngineInstanceData instance_data = unpackEngineInstanceData( - engineInstanceBuffer[current_instance_idx]); - + EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[current_instance_idx]); ObjectData obj = objectDataBuffer[instance_data.objectID]; uint draw_offset; @@ -105,9 +103,7 @@ void instanceCull(uint3 tid : SV_DispatchThreadID, } else { draw_data.materialID = instance_data.matID; } - draw_data.instanceID = current_instance_idx; - draw_data.color = instance_data.color; drawCommandBuffer[draw_id] = draw_cmd; drawDataBuffer[draw_id] = draw_data; diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl index 786e5508..9e250a1a 100644 --- a/src/render/shaders/viewer_draw.hlsl +++ b/src/render/shaders/viewer_draw.hlsl @@ -35,15 +35,15 @@ Texture2D materialTexturesArray[]; [[vk::binding(1, 2)]] SamplerState linearSampler; +[[vk::binding(2, 2)]] +StructuredBuffer materialTexturesIndices; + struct V2F { [[vk::location(0)]] float3 normal : TEXCOORD0; [[vk::location(1)]] float3 position : TEXCOORD1; - [[vk::location(2)]] float4 color : TEXCOORD2; [[vk::location(3)]] float dummy : TEXCOORD3; [[vk::location(4)]] float2 uv : TEXCOORD4; - [[vk::location(5)]] int texIdx : TEXCOORD5; - [[vk::location(6)]] float roughness : TEXCOORD6; - [[vk::location(7)]] float metalness : TEXCOORD7; + [[vk::location(3)]] int materialIdx : TEXCOORD2; }; PerspectiveCameraData getCameraData() @@ -72,24 +72,21 @@ float4 vert(in uint vid : SV_VertexID, out V2F v2f) : SV_Position { DrawData draw_data = drawDataBuffer[draw_id]; - Vertex vert = unpackVertex(vertexDataBuffer[vid]); uint instance_id = draw_data.instanceID; - EngineInstanceData instance_data = unpackEngineInstanceData( - engineInstanceBuffer[instance_id]); - + EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]); PerspectiveCameraData view_data = getCameraData(); float3 to_view_translation; float4 to_view_rotation; - computeCompositeTransform(instance_data.position, instance_data.rotation, + computeCompositeTransform( + instance_data.position, instance_data.rotation, view_data.pos, view_data.rot, - to_view_translation, to_view_rotation); + to_view_translation, to_view_rotation + ); - float3 view_pos = - rotateVec(to_view_rotation, instance_data.scale * vert.position) + - to_view_translation; + float3 view_pos = rotateVec(to_view_rotation, instance_data.scale * vert.position) + to_view_translation; #if 0 float4 clip_pos = float4( @@ -135,34 +132,11 @@ float4 vert(in uint vid : SV_VertexID, view_pos.y); } - v2f.normal = normalize( - rotateVec(instance_data.rotation, (vert.normal / instance_data.scale))); - v2f.uv = vert.uv; - - v2f.position = rotateVec(instance_data.rotation, - instance_data.scale * vert.position) + instance_data.position; + v2f.normal = normalize(rotateVec(instance_data.rotation, (vert.normal / instance_data.scale))); + v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y); + v2f.position = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position; v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0]; - - - - v2f.texIdx = -1; - // Defaults for now - v2f.roughness = 0.8; - v2f.metalness = 0.2; - - if (draw_data.materialID == -2) { - v2f.color = hexToRgb(draw_data.color); - } else { - int32_t material_id = draw_data.materialID; - - float4 color = materialBuffer[material_id].color; - - // Material - v2f.color = color; - v2f.texIdx = materialBuffer[material_id].textureIdx; - v2f.roughness = materialBuffer[material_id].roughness; - v2f.metalness = materialBuffer[material_id].metalness; - } + v2f.materialIdx = draw_data.materialID; return clip_pos; } @@ -177,18 +151,25 @@ struct PixelOutput { PixelOutput frag(in V2F v2f) { PixelOutput output; - output.color = v2f.color; - output.color.a = v2f.roughness; - output.normal = float4(normalize(v2f.normal), 1.f); - output.position = float4(v2f.position, v2f.dummy * 0.0000001f); - output.position.a += v2f.metalness; - // output.color.rgb = v2f.normal.xyz; + float metalness = mat_data.metalness; + float roughness = mat_data.roughness; + MaterialData mat_data = materialBuffer[v2f.materialIdx]; + float4 color = mat_data.color; - if (v2f.texIdx != -1) { - output.color *= materialTexturesArray[v2f.texIdx].SampleLevel( - linearSampler, float2(v2f.uv.x, 1.f - v2f.uv.y), 0); + int texture_idx = -1; + uint texture_count = mat_data.textureCount; + if (texture_count > 0) { + uint texture_start = mat_data.textureOffset; + texture_idx = materialTexturesIndices[texture_start + v2f.worldID % texture_count]; } - + if (texture_idx != -1) { + color *= materialTexturesArray[texture_idx].SampleLevel(linearSampler, v2f.uv, 0); + } + + output.color = color; + output.color.a = roughness; + output.normal = float4(normalize(v2f.normal), 1.f); + output.position = float4(v2f.position, v2f.dummy * 0.0000001f + metalness); return output; } diff --git a/src/render/shaders/voxel_draw.hlsl b/src/render/shaders/voxel_draw.hlsl index 76c96f4e..7b7b78bc 100644 --- a/src/render/shaders/voxel_draw.hlsl +++ b/src/render/shaders/voxel_draw.hlsl @@ -23,6 +23,9 @@ Texture2D materialTexturesArray[]; [[vk::binding(1, 1)]] SamplerState linearSampler; +[[vk::binding(2, 1)]] +StructuredBuffer materialTexturesIndices; + struct V2F { [[vk::location(0)]] float3 normal : TEXCOORD0; [[vk::location(1)]] float3 position : TEXCOORD1; From 4acad9b957151a3c8cd5069f37e7d1a5b0d1d879 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 00:50:33 -0500 Subject: [PATCH 02/11] fix(1) --- include/madrona/importer.hpp | 2 +- src/bridge/bindings.cpp | 62 ++++++------- src/bridge/mgr.cpp | 171 ++++++++++++++++++++--------------- src/importer/gltf.cpp | 4 +- src/render/render_ctx.cpp | 57 ++++-------- 5 files changed, 147 insertions(+), 149 deletions(-) diff --git a/include/madrona/importer.hpp b/include/madrona/importer.hpp index 731035f4..9ff36dbd 100644 --- a/include/madrona/importer.hpp +++ b/include/madrona/importer.hpp @@ -105,7 +105,7 @@ struct ImportedAssets { DynArray objects; DynArray materials; - DynArray materialTextures; + DynArray materialTextures; DynArray instances; DynArray textures; }; diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp index e5948391..c928e429 100644 --- a/src/bridge/bindings.cpp +++ b/src/bridge/bindings.cpp @@ -150,39 +150,39 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { nb::arg("visualizer_gpu_handles") = nb::none(), nb::keep_alive<1, 31>()) .def("init", [](Manager &mgr, - nb::ndarray> geom_pos, - nb::ndarray> geom_rot, - nb::ndarray> cam_pos, - nb::ndarray> cam_rot, - nb::ndarray> mat_ids, - // nb::ndarray> geom_rgb, - nb::ndarray> geom_sizes, - nb::ndarray> light_pos, - nb::ndarray> light_dir, - nb::ndarray> light_rgb, - nb::ndarray> light_isdir, - nb::ndarray> light_castshadow, - nb::ndarray> light_cutoff, - nb::ndarray> light_attenuation, - nb::ndarray> light_intensity) - + nb::ndarray> geom_pos, + nb::ndarray> geom_rot, + nb::ndarray> cam_pos, + nb::ndarray> cam_rot, + nb::ndarray> mat_ids, + nb::ndarray> geom_rgb, + nb::ndarray> geom_sizes, + nb::ndarray> light_pos, + nb::ndarray> light_dir, + nb::ndarray> light_rgb, + nb::ndarray> light_isdir, + nb::ndarray> light_castshadow, + nb::ndarray> light_cutoff, + nb::ndarray> light_attenuation, + nb::ndarray> light_intensity) { mgr.init( - reinterpret_cast(geom_pos.data()), - reinterpret_cast(geom_rot.data()), - reinterpret_cast(cam_pos.data()), - reinterpret_cast(cam_rot.data()), - reinterpret_cast(mat_ids.data()), - // reinterpret_cast(geom_rgb.data()), - reinterpret_cast(geom_sizes.data()), - reinterpret_cast(light_pos.data()), - reinterpret_cast(light_dir.data()), - reinterpret_cast(light_rgb.data()), - reinterpret_cast(light_isdir.data()), - reinterpret_cast(light_castshadow.data()), - reinterpret_cast(light_cutoff.data()), - reinterpret_cast(light_attenuation.data()), - reinterpret_cast(light_intensity.data())); + reinterpret_cast(geom_pos.data()) if geom_pos.shape(0) > 0 else nullptr, + reinterpret_cast(geom_rot.data()) if geom_rot.shape(0) > 0 else nullptr, + reinterpret_cast(cam_pos.data()) if cam_pos.shape(0) > 0 else nullptr, + reinterpret_cast(cam_rot.data()) if cam_rot.shape(0) > 0 else nullptr, + reinterpret_cast(mat_ids.data()) if mat_ids.shape(0) > 0 else nullptr, + reinterpret_cast(geom_rgb.data()) if geom_rgb.shape(0) > 0 else nullptr, + reinterpret_cast(geom_sizes.data()) if geom_sizes.shape(0) > 0 else nullptr, + reinterpret_cast(light_pos.data()) if light_pos.shape(0) > 0 else nullptr, + reinterpret_cast(light_dir.data()) if light_dir.shape(0) > 0 else nullptr, + reinterpret_cast(light_rgb.data()) if light_rgb.shape(0) > 0 else nullptr, + reinterpret_cast(light_isdir.data()) if light_isdir.shape(0) > 0 else nullptr, + reinterpret_cast(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr, + reinterpret_cast(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr, + reinterpret_cast(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr, + reinterpret_cast(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr, + ); }) .def("render", [](Manager &mgr, nb::ndarray> geom_pos, diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp index 167bb256..cbc26392 100644 --- a/src/bridge/mgr.cpp +++ b/src/bridge/mgr.cpp @@ -154,26 +154,33 @@ struct Manager::Impl { Quat *cam_rotations, cudaStream_t strm) { - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::InstancePositions), - geom_positions, - sizeof(Vector3) * numGeoms * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::InstanceRotations), - geom_rotations, - sizeof(Quat) * numGeoms * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::CameraPositions), - cam_positions, - sizeof(Vector3) * numCams * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::CameraRotations), - cam_rotations, - sizeof(Quat) * numCams * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); + uint32_t total_geoms = numGeoms * cfg.numWorlds; + if (geom_positions != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::InstancePositions), + geom_positions, sizeof(Vector3) * total_geoms, cudaMemcpyDeviceToDevice, strm + ); + } + if (geom_rotations != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::InstanceRotations), + geom_rotations, sizeof(Quat) * total_geoms, cudaMemcpyDeviceToDevice, strm + ); + } + + uint32_t total_cams = numCams * cfg.numWorlds; + if (cam_positions != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::CameraPositions), + cam_positions, sizeof(Vector3) * total_cams, cudaMemcpyDeviceToDevice, strm + ); + } + if (cam_rotations != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::CameraRotations), + cam_rotations, sizeof(Quat) * total_cams, cudaMemcpyDeviceToDevice, strm + ); + } } inline void copyInProperties( @@ -190,63 +197,77 @@ struct Manager::Impl { float *light_intensity, cudaStream_t strm) { - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::InstanceMatOverrides), - mat_overrides, - sizeof(MaterialOverride) * numGeoms * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::InstanceColorOverrides), - col_overrides, - sizeof(ColorOverride) * numGeoms * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::InstanceScales), - geom_sizes, - sizeof(Diag3x3) * numGeoms * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); + uint32_t total_geoms = numGeoms * cfg.numWorlds; + if (mat_overrides != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::InstanceMatOverrides), + mat_overrides, sizeof(MaterialOverride) * total_geoms, cudaMemcpyDeviceToDevice, strm + ); + } + // TODO: Remove ColorOverride from ECS + if (col_overrides != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::InstanceColorOverrides), + col_overrides, sizeof(ColorOverride) * total_geoms, cudaMemcpyDeviceToDevice, strm + ); + } + if (geom_sizes != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::InstanceScales), + geom_sizes, sizeof(Diag3x3) * total_geoms, cudaMemcpyDeviceToDevice, strm + ); + } // Copy light properties to GPU - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightPositions), - light_pos, - sizeof(Vector3) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightDirections), - light_dir, - sizeof(Vector3) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightColors), - light_color, - sizeof(ColorOverride) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightTypes), - light_isdir, - sizeof(bool) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightShadows), - light_castshadow, - sizeof(bool) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightCutoffAngles), - light_cutoff, - sizeof(float) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightAttenuations), - light_attenuation, - sizeof(float) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); - cudaMemcpyAsync( - gpuExec.getExported((CountT)ExportID::LightIntensities), - light_intensity, - sizeof(float) * numLights * cfg.numWorlds, - cudaMemcpyDeviceToDevice, strm); + uint32_t total_lights = numLights * cfg.numWorlds; + if (light_pos != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightPositions), + light_pos, sizeof(Vector3) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_dir != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightDirections), + light_dir, sizeof(Vector3) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_color != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightColors), + light_color, sizeof(ColorOverride) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_isdir != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightTypes), + light_isdir, sizeof(bool) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_castshadow != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightShadows), + light_castshadow, sizeof(bool) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_cutoff != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightCutoffAngles), + light_cutoff, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_attenuation != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightAttenuations), + light_attenuation, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } + if (light_intensity != nullptr) { + cudaMemcpyAsync( + gpuExec.getExported((CountT)ExportID::LightIntensities), + light_intensity, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm + ); + } } inline void init(const Vector3 *geom_positions, diff --git a/src/importer/gltf.cpp b/src/importer/gltf.cpp index 1532be5e..c7b656cf 100644 --- a/src/importer/gltf.cpp +++ b/src/importer/gltf.cpp @@ -1811,10 +1811,10 @@ static bool gltfImportAssets(LoaderData &loader, int32_t texture_id = material.baseColorIdx; if (texture_id != -1) { texture_id += prev_tex_idx; - imported.materialTextures.push_back(uint32_t(texture_id)); + imported.materialTextures.push_back(texture_id); imported.materials.emplace_back(SourceMaterial { .color = material.baseColor, - .textureIdx = &imported.materialTextures[imported.materialTextures.size() - 1], + .textureIdx = imported.materialTextures.data() + (imported.materialTextures.size() - 1), .numTextures = 1, .roughness = material.roughness, .metalness = material.metallic, diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp index d190e842..21b7d435 100644 --- a/src/render/render_ctx.cpp +++ b/src/render/render_ctx.cpp @@ -55,7 +55,7 @@ using namespace vk; using Vertex = render::shader::Vertex; using PackedVertex = render::shader::PackedVertex; using MeshData = render::shader::MeshData; -using MaterialDataShader = render::shader::MaterialData; +using MaterialData = render::shader::MaterialData; using ObjectData = render::shader::ObjectData; using DrawPushConst = render::shader::DrawPushConst; using CullPushConst = render::shader::CullPushConst; @@ -635,11 +635,8 @@ static EngineInterop setupEngineInterop(Device &dev, world_ids_views_base = malloc(sizeof(uint64_t) * num_worlds * max_views_per_world); } else { #ifdef MADRONA_VK_CUDA_SUPPORT - views_gpu = alloc.makeDedicatedBuffer( - num_views_bytes, false, true); - views_cuda.emplace(dev, views_gpu->mem, - num_views_bytes); - + views_gpu = alloc.makeDedicatedBuffer(num_views_bytes, false, true); + views_cuda.emplace(dev, views_gpu->mem, num_views_bytes); views_hdl = views_gpu->buf.buffer; views_base = (char *)views_cuda->getDevicePointer(); #endif @@ -657,12 +654,8 @@ static EngineInterop setupEngineInterop(Device &dev, instances_base = malloc(sizeof(render::shader::PackedInstanceData) * num_worlds * max_instances_per_world); } else { #ifdef MADRONA_VK_CUDA_SUPPORT - instances_gpu = alloc.makeDedicatedBuffer( - num_instances_bytes, false, true); - - instances_cuda.emplace(dev, instances_gpu->mem, - num_instances_bytes); - + instances_gpu = alloc.makeDedicatedBuffer(num_instances_bytes, false, true); + instances_cuda.emplace(dev, instances_gpu->mem, num_instances_bytes); instances_hdl = instances_gpu->buf.buffer; instances_base = (char *)instances_cuda->getDevicePointer(); #endif @@ -678,15 +671,10 @@ static EngineInterop setupEngineInterop(Device &dev, lights_cpu = alloc.makeStagingBuffer(num_lights_bytes); lights_hdl = lights_cpu->buffer; lights_base = malloc(sizeof(render::shader::PackedLightData) * num_worlds * max_lights_per_world); - } - else - { + } else { #ifdef MADRONA_VK_CUDA_SUPPORT - lights_gpu = alloc.makeDedicatedBuffer( - num_lights_bytes, false, true); - lights_cuda.emplace(dev, lights_gpu->mem, - num_lights_bytes); - + lights_gpu = alloc.makeDedicatedBuffer(num_lights_bytes, false, true); + lights_cuda.emplace(dev, lights_gpu->mem, num_lights_bytes); lights_hdl = lights_gpu->buf.buffer; lights_base = (char *)lights_cuda->getDevicePointer(); #endif @@ -703,11 +691,8 @@ static EngineInterop setupEngineInterop(Device &dev, aabb_base = malloc(sizeof(render::shader::AABB) * num_worlds * max_instances_per_world); } else { #ifdef MADRONA_VK_CUDA_SUPPORT - aabb_gpu = alloc.makeDedicatedBuffer( - num_aabb_bytes, false, true); - aabb_cuda.emplace(dev, aabb_gpu->mem, - num_aabb_bytes); - + aabb_gpu = alloc.makeDedicatedBuffer(num_aabb_bytes, false, true); + aabb_cuda.emplace(dev, aabb_gpu->mem, num_aabb_bytes); aabb_hdl = aabb_gpu->buf.buffer; aabb_base = (char *)aabb_cuda->getDevicePointer(); #endif @@ -723,12 +708,8 @@ static EngineInterop setupEngineInterop(Device &dev, instance_offsets_base = instance_offsets_cpu->ptr; } else { #ifdef MADRONA_VK_CUDA_SUPPORT - instance_offsets_gpu = alloc.makeDedicatedBuffer( - num_offsets_bytes, false, true); - - instance_offsets_cuda.emplace(dev, instance_offsets_gpu->mem, - num_offsets_bytes); - + instance_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true); + instance_offsets_cuda.emplace(dev, instance_offsets_gpu->mem, num_offsets_bytes); instance_offsets_hdl = instance_offsets_gpu->buf.buffer; instance_offsets_base = (char *)instance_offsets_cuda->getDevicePointer(); #endif @@ -769,8 +750,7 @@ static EngineInterop setupEngineInterop(Device &dev, } } - const uint32_t num_voxels = voxel_config.xLength - * voxel_config.yLength * voxel_config.zLength; + const uint32_t num_voxels = voxel_config.xLength * voxel_config.yLength * voxel_config.zLength; const uint32_t staging_size = num_voxels > 0 ? num_voxels * sizeof(int32_t) : 4; auto voxel_cpu = Optional::none(); @@ -804,8 +784,7 @@ static EngineInterop setupEngineInterop(Device &dev, AtomicU32 *total_num_lights_cpu_inc = nullptr; if (!gpu_input) { - total_num_views_readback = (uint32_t *)malloc( - 3*sizeof(uint32_t)); + total_num_views_readback = (uint32_t *)malloc(3*sizeof(uint32_t)); total_num_instances_readback = total_num_views_readback + 1; total_num_lights_readback = total_num_instances_readback + 1; @@ -855,10 +834,8 @@ static EngineInterop setupEngineInterop(Device &dev, gpu_bridge = nullptr; } else { #ifdef MADRONA_VK_CUDA_SUPPORT - gpu_bridge = (const RenderECSBridge *)cu::allocGPU( - sizeof(RenderECSBridge)); - cudaMemcpy((void *)gpu_bridge, &bridge, sizeof(RenderECSBridge), - cudaMemcpyHostToDevice); + gpu_bridge = (const RenderECSBridge *)cu::allocGPU(sizeof(RenderECSBridge)); + cudaMemcpy((void *)gpu_bridge, &bridge, sizeof(RenderECSBridge), cudaMemcpyHostToDevice); #endif } @@ -2041,7 +2018,7 @@ CountT RenderContext::loadObjects(Span src_objs, float enc_uvs = std::bit_cast(packHalf2x16(uv)); PackedVertex *vertex_data = vertex_ptr + (vertex_offset++); vertex_data->data[0] = Vector4 { pos.x, pos.y, pos.z, enc_nt.x }; - vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uv, 0 }; + vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uvs, 0 }; } memcpy(indices_ptr + index_offset, mesh.indices, sizeof(uint32_t) * num_mesh_indices); From 8148376a0200795ba5a56ff78e6ce9926d843060 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 01:25:46 -0500 Subject: [PATCH 03/11] fix(2) --- src/bridge/bindings.cpp | 1 - src/bridge/mgr.cpp | 57 ++++++++++++++++++++------------------- src/bridge/mgr.hpp | 1 - src/render/render_ctx.cpp | 4 +-- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp index c928e429..34870d35 100644 --- a/src/bridge/bindings.cpp +++ b/src/bridge/bindings.cpp @@ -114,7 +114,6 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { viz_gpu_hdls != nullptr ? *viz_gpu_hdls : Optional::none() ); - free(ptr_geom_rgba); free(ptr_geom_mat_ids); free(ptr_geom_data_ids); }, nb::arg("gpu_id"), diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp index cbc26392..7cfcca0a 100644 --- a/src/bridge/mgr.cpp +++ b/src/bridge/mgr.cpp @@ -167,7 +167,7 @@ struct Manager::Impl { geom_rotations, sizeof(Quat) * total_geoms, cudaMemcpyDeviceToDevice, strm ); } - + uint32_t total_cams = numCams * cfg.numWorlds; if (cam_positions != nullptr) { cudaMemcpyAsync( @@ -525,33 +525,33 @@ static RTAssets loadRenderObjects( } // Create materials for geoms that do not have one assigned - for (CountT i = 0; i < model.numGeoms; i++) { - if (model.geomMatIDs[i] == -1) { - const math::Vector4 &rgba_i = model.geomRGBA[i]; - SourceMaterial mat = { - .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w}, - .textureIdx = nullptr, - .numTextures = 0, - .roughness = 0.8f, - .metalness = 0.2f, - }; - materials.push_back(mat); - model.geomMatIDs[i] = materials.size() - 1; - - for (CountT j = i + 1; j < model.numGeoms; j++) { - // FIX: Should probably implement == op for Vector4 - const math::Vector4 &rgba_j = model.geomRGBA[j]; - if (model.geomMatIDs[j] == -1 && - rgba_i.x == rgba_j.x && - rgba_i.y == rgba_j.y && - rgba_i.z == rgba_j.z && - rgba_i.w == rgba_j.w - ) { - model.geomMatIDs[j] = materials.size() - 1; - } - } - } - } + // for (CountT i = 0; i < model.numGeoms; i++) { + // if (model.geomMatIDs[i] == -1) { + // const math::Vector4 &rgba_i = model.geomRGBA[i]; + // SourceMaterial mat = { + // .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w}, + // .textureIdx = nullptr, + // .numTextures = 0, + // .roughness = 0.8f, + // .metalness = 0.2f, + // }; + // materials.push_back(mat); + // model.geomMatIDs[i] = materials.size() - 1; + + // for (CountT j = i + 1; j < model.numGeoms; j++) { + // // FIX: Should probably implement == op for Vector4 + // const math::Vector4 &rgba_j = model.geomRGBA[j]; + // if (model.geomMatIDs[j] == -1 && + // rgba_i.x == rgba_j.x && + // rgba_i.y == rgba_j.y && + // rgba_i.z == rgba_j.z && + // rgba_i.w == rgba_j.w + // ) { + // model.geomMatIDs[j] = materials.size() - 1; + // } + // } + // } + // } HeapArray objs(model.numGeoms + 1); @@ -605,6 +605,7 @@ static RTAssets loadRenderObjects( .numFaces = source_mesh.numFaces, .materialIDX = static_cast(model.geomMatIDs[i]), }; + assert(model.geomMatIDs[i] >= 0); } objs[i] = { diff --git a/src/bridge/mgr.hpp b/src/bridge/mgr.hpp index a887969d..a70b0062 100644 --- a/src/bridge/mgr.hpp +++ b/src/bridge/mgr.hpp @@ -39,7 +39,6 @@ struct GSModel { int32_t *geomMatIDs; int32_t *enabledGeomGroups; madrona::math::Vector3 *geomSizes; - madrona::math::Vector4 *geomRGBA; madrona::math::Vector4 *matRGBA; int32_t *matTexIDs; int32_t *matTexOffsets; diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp index 21b7d435..ffb0e476 100644 --- a/src/render/render_ctx.cpp +++ b/src/render/render_ctx.cpp @@ -55,7 +55,7 @@ using namespace vk; using Vertex = render::shader::Vertex; using PackedVertex = render::shader::PackedVertex; using MeshData = render::shader::MeshData; -using MaterialData = render::shader::MaterialData; +using MaterialDataShader = render::shader::MaterialData; using ObjectData = render::shader::ObjectData; using DrawPushConst = render::shader::DrawPushConst; using CullPushConst = render::shader::CullPushConst; @@ -2030,7 +2030,7 @@ CountT RenderContext::loadObjects(Span src_objs, int32_t mat_texture_offset = 0; for (const SourceMaterial &mat : src_mats) { int32_t num_mat_textures = (int32_t)mat.numTextures; - MaterialData *mat_data = materials_ptr + (mat_offset++); + MaterialDataShader *mat_data = materials_ptr + (mat_offset++); mat_data->color = mat.color; mat_data->roughness = mat.roughness; mat_data->metalness = mat.metalness; From 625fbefe8202dcb625a8dd10427e01fad6ea2934 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 01:32:17 -0500 Subject: [PATCH 04/11] fix(3) --- src/bridge/bindings.cpp | 2 +- src/render/asset_processor.cpp | 42 ++++++++++++---------------------- src/render/batch_renderer.cpp | 37 +++++++++++++----------------- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp index 34870d35..acbeeb18 100644 --- a/src/bridge/bindings.cpp +++ b/src/bridge/bindings.cpp @@ -180,7 +180,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { reinterpret_cast(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr, reinterpret_cast(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr, reinterpret_cast(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr, - reinterpret_cast(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr, + reinterpret_cast(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr ); }) .def("render", [](Manager &mgr, diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp index 16536051..a2206474 100644 --- a/src/render/asset_processor.cpp +++ b/src/render/asset_processor.cpp @@ -330,14 +330,12 @@ MaterialData initMaterialData( } MaterialData cpu_mat_data = { - .textures = (cudaTextureObject_t *) - malloc(sizeof(cudaTextureObject_t) * num_textures), - .textureBuffers = (cudaArray_t *) - malloc(sizeof(cudaArray_t) * num_non_mipmap_textures), - .mipmapTextureBuffers = (cudaMipmappedArray_t *) - malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures), - .materials = (Material *) - malloc(sizeof(Material) * num_materials) + .textures = (cudaTextureObject_t *) malloc(sizeof(cudaTextureObject_t) * num_textures), + .textureBuffers = (cudaArray_t *) malloc(sizeof(cudaArray_t) * num_non_mipmap_textures), + .numTextureBuffers = num_non_mipmap_textures, + .mipmapTextureBuffers = (cudaMipmappedArray_t *) malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures), + .numMipmapTextureBuffers = num_mipmap_textures, + .materials = (Material *) malloc(sizeof(Material) * num_materials) }; for (uint32_t i = 0; i < num_textures; ++i) { @@ -349,12 +347,9 @@ MaterialData initMaterialData( width = tex.width; height = tex.height; - cudaChannelFormatDesc channel_desc = - cudaCreateChannelDesc(); - + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); cudaArray_t cuda_array; - REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, - width, height, cudaArrayDefault)); + REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, width, height, cudaArrayDefault)); REQ_CUDA(cudaMemcpy2DToArray(cuda_array, 0, 0, tex.data, 16 * width / 4, @@ -375,8 +370,7 @@ MaterialData initMaterialData( tex_desc.sRGB = 1; cudaTextureObject_t tex_obj = 0; - REQ_CUDA(cudaCreateTextureObject(&tex_obj, - &res_desc, &tex_desc, nullptr)); + REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr)); cpu_mat_data.textures[i] = tex_obj; cpu_mat_data.textureBuffers[i] = cuda_array; @@ -443,13 +437,10 @@ MaterialData initMaterialData( } } - cpu_mat_data.numTextureBuffers = num_non_mipmap_textures; - cpu_mat_data.numMipmapTextureBuffers = num_mipmap_textures; - for (uint32_t i = 0; i < num_materials; ++i) { Material mat = { .color = materials[i].color, - .textureIdx = materials[i].textureIdx, + .textureIdx = materials[i].numTextures > 0 ? *(materials[i].textureIdx) : -1, //TODO: support batch .roughness = materials[i].roughness, .metalness = materials[i].metalness, }; @@ -458,15 +449,13 @@ MaterialData initMaterialData( } cudaTextureObject_t *gpu_tex_buffer; - REQ_CUDA(cudaMalloc(&gpu_tex_buffer, - sizeof(cudaTextureObject_t) * num_textures)); + REQ_CUDA(cudaMalloc(&gpu_tex_buffer, sizeof(cudaTextureObject_t) * num_textures)); REQ_CUDA(cudaMemcpy(gpu_tex_buffer, cpu_mat_data.textures, sizeof(cudaTextureObject_t) * num_textures, cudaMemcpyHostToDevice)); Material *mat_buffer; - REQ_CUDA(cudaMalloc(&mat_buffer, - sizeof(Material) * num_materials)); + REQ_CUDA(cudaMalloc(&mat_buffer, sizeof(Material) * num_materials)); REQ_CUDA(cudaMemcpy(mat_buffer, cpu_mat_data.materials, sizeof(Material) * num_materials, cudaMemcpyHostToDevice)); @@ -484,13 +473,10 @@ MaterialData initMaterialData( } #endif -math::AABB *makeAABBs( - Span src_objs) +math::AABB *makeAABBs(Span src_objs) { int num_objects = (int)src_objs.size(); - - math::AABB *aabbs = (math::AABB *)malloc(sizeof(math::AABB) * - num_objects); + math::AABB *aabbs = (math::AABB *)malloc(sizeof(math::AABB) * num_objects); for (int obj_idx = 0; obj_idx < num_objects; ++obj_idx) { auto &obj = src_objs[obj_idx]; diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp index 23bb9789..c2de67f6 100644 --- a/src/render/batch_renderer.cpp +++ b/src/render/batch_renderer.cpp @@ -327,7 +327,7 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev, blend_info.pAttachments = blend_attachments.data(); // Dynamic - std::array dyn_enable {{ + std::array dyn_enable { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, }; @@ -2043,11 +2043,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info, } { // Import the views - VkDeviceSize num_views_bytes = info.numViews * - sizeof(shader::PackedViewData); - + VkDeviceSize num_views_bytes = info.numViews * sizeof(shader::PackedViewData); VkBufferCopy view_data_copy = { - .srcOffset = 0, .dstOffset = 0, + .srcOffset = 0, + .dstOffset = 0, .size = num_views_bytes }; @@ -2057,11 +2056,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info, } { // Import the instances - VkDeviceSize num_instances_bytes = info.numInstances * - sizeof(shader::PackedInstanceData); - + VkDeviceSize num_instances_bytes = info.numInstances * sizeof(shader::PackedInstanceData); VkBufferCopy instance_data_copy = { - .srcOffset = 0, .dstOffset = 0, + .srcOffset = 0, + .dstOffset = 0, .size = num_instances_bytes }; @@ -2071,11 +2069,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info, } { // Import the offsets for instances - VkDeviceSize num_offsets_bytes = info.numWorlds * - sizeof(int32_t); - + VkDeviceSize num_offsets_bytes = info.numWorlds * sizeof(int32_t); VkBufferCopy offsets_data_copy = { - .srcOffset = 0, .dstOffset = 0, + .srcOffset = 0, + .dstOffset = 0, .size = num_offsets_bytes }; @@ -2086,11 +2083,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info, #if 0 { // Import the aabbs for instances - VkDeviceSize num_aabbs_bytes = info.numInstances * - sizeof(shader::AABB); - + VkDeviceSize num_aabbs_bytes = info.numInstances * sizeof(shader::AABB); VkBufferCopy aabb_data_copy = { - .srcOffset = 0, .dstOffset = 0, + .srcOffset = 0, + .dstOffset = 0, .size = num_aabbs_bytes }; @@ -2101,11 +2097,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info, #endif { // Import the offsets for views - VkDeviceSize num_offsets_bytes = info.numWorlds * - sizeof(int32_t); - + VkDeviceSize num_offsets_bytes = info.numWorlds * sizeof(int32_t); VkBufferCopy offsets_data_copy = { - .srcOffset = 0, .dstOffset = 0, + .srcOffset = 0, + .dstOffset = 0, .size = num_offsets_bytes }; From 988e566de775758d9387bea855c4e406ad5f6d0b Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 02:42:19 -0500 Subject: [PATCH 05/11] fix(4) --- src/bridge/bindings.cpp | 30 ++++++++++++++-------------- src/bridge/gs_madrona/renderer_gs.py | 4 +--- src/render/shaders/viewer_draw.hlsl | 10 ++++++---- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp index acbeeb18..0c55d79c 100644 --- a/src/bridge/bindings.cpp +++ b/src/bridge/bindings.cpp @@ -166,21 +166,21 @@ NB_MODULE(_gs_madrona_batch_renderer, m) { nb::ndarray> light_intensity) { mgr.init( - reinterpret_cast(geom_pos.data()) if geom_pos.shape(0) > 0 else nullptr, - reinterpret_cast(geom_rot.data()) if geom_rot.shape(0) > 0 else nullptr, - reinterpret_cast(cam_pos.data()) if cam_pos.shape(0) > 0 else nullptr, - reinterpret_cast(cam_rot.data()) if cam_rot.shape(0) > 0 else nullptr, - reinterpret_cast(mat_ids.data()) if mat_ids.shape(0) > 0 else nullptr, - reinterpret_cast(geom_rgb.data()) if geom_rgb.shape(0) > 0 else nullptr, - reinterpret_cast(geom_sizes.data()) if geom_sizes.shape(0) > 0 else nullptr, - reinterpret_cast(light_pos.data()) if light_pos.shape(0) > 0 else nullptr, - reinterpret_cast(light_dir.data()) if light_dir.shape(0) > 0 else nullptr, - reinterpret_cast(light_rgb.data()) if light_rgb.shape(0) > 0 else nullptr, - reinterpret_cast(light_isdir.data()) if light_isdir.shape(0) > 0 else nullptr, - reinterpret_cast(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr, - reinterpret_cast(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr, - reinterpret_cast(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr, - reinterpret_cast(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr + geom_pos.shape(0) > 0 ? reinterpret_cast(geom_pos.data()) : nullptr, + geom_rot.shape(0) > 0 ? reinterpret_cast(geom_rot.data()) : nullptr, + cam_pos.shape(0) > 0 ? reinterpret_cast(cam_pos.data()) : nullptr, + cam_rot.shape(0) > 0 ? reinterpret_cast(cam_rot.data()) : nullptr, + mat_ids.shape(0) > 0 ? reinterpret_cast(mat_ids.data()) : nullptr, + geom_rgb.shape(0) > 0 ? reinterpret_cast(geom_rgb.data()) : nullptr, + geom_sizes.shape(0) > 0 ? reinterpret_cast(geom_sizes.data()) : nullptr, + light_pos.shape(0) > 0 ? reinterpret_cast(light_pos.data()) : nullptr, + light_dir.shape(0) > 0 ? reinterpret_cast(light_dir.data()) : nullptr, + light_rgb.shape(0) > 0 ? reinterpret_cast(light_rgb.data()) : nullptr, + light_isdir.shape(0) > 0 ? reinterpret_cast(light_isdir.data()) : nullptr, + light_castshadow.shape(0) > 0 ? reinterpret_cast(light_castshadow.data()) : nullptr, + light_cutoff.shape(0) > 0 ? reinterpret_cast(light_cutoff.data()) : nullptr, + light_attenuation.shape(0) > 0 ? reinterpret_cast(light_attenuation.data()) : nullptr, + light_intensity.shape(0) > 0 ? reinterpret_cast(light_intensity.data()) : nullptr ); }) .def("render", [](Manager &mgr, diff --git a/src/bridge/gs_madrona/renderer_gs.py b/src/bridge/gs_madrona/renderer_gs.py index b2a9f7fb..377463ea 100644 --- a/src/bridge/gs_madrona/renderer_gs.py +++ b/src/bridge/gs_madrona/renderer_gs.py @@ -31,7 +31,6 @@ def __init__( geom_retriever: GeomRetriever, gpu_id: int, num_worlds: int, - num_cameras: int, num_lights: int, cam_fovs_tensor: torch.Tensor, cam_znears_tensor: torch.Tensor, @@ -44,7 +43,7 @@ def __init__( assert geom_retriever is not None, "GeomRetriever is required for MadronaBatchRendererAdapter" assert gpu_id >= 0, "GPU ID must be greater than or equal to 0" assert num_worlds > 0, "Number of worlds must be greater than 0" - assert num_cameras > 0, "Must have at least one camera for Madrona to work!" + assert cam_fovs_tensor.shape[0] > 0, "Must have at least one camera for Madrona to work!" assert batch_render_view_width > 0, "Batch render view width must be greater than 0" assert batch_render_view_height > 0, "Batch render view height must be greater than 0" @@ -72,7 +71,6 @@ def __init__( gpu_id=gpu_id, **geom_args_static, num_lights=num_lights, - num_cams=num_cameras, num_worlds=num_worlds, batch_render_view_width=batch_render_view_width, batch_render_view_height=batch_render_view_height, diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl index 9e250a1a..ebe1761f 100644 --- a/src/render/shaders/viewer_draw.hlsl +++ b/src/render/shaders/viewer_draw.hlsl @@ -41,9 +41,10 @@ StructuredBuffer materialTexturesIndices; struct V2F { [[vk::location(0)]] float3 normal : TEXCOORD0; [[vk::location(1)]] float3 position : TEXCOORD1; - [[vk::location(3)]] float dummy : TEXCOORD3; - [[vk::location(4)]] float2 uv : TEXCOORD4; - [[vk::location(3)]] int materialIdx : TEXCOORD2; + [[vk::location(2)]] float dummy : TEXCOORD3; + [[vk::location(3)]] float2 uv : TEXCOORD4; + [[vk::location(4)]] int materialIdx : TEXCOORD5; + [[vk::location(5)]] float worldIdx : TEXCOORD6; }; PerspectiveCameraData getCameraData() @@ -137,6 +138,7 @@ float4 vert(in uint vid : SV_VertexID, v2f.position = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position; v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0]; v2f.materialIdx = draw_data.materialID; + v2f.worldIdx = instance_data.worldID; return clip_pos; } @@ -161,7 +163,7 @@ PixelOutput frag(in V2F v2f) uint texture_count = mat_data.textureCount; if (texture_count > 0) { uint texture_start = mat_data.textureOffset; - texture_idx = materialTexturesIndices[texture_start + v2f.worldID % texture_count]; + texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count]; } if (texture_idx != -1) { color *= materialTexturesArray[texture_idx].SampleLevel(linearSampler, v2f.uv, 0); From e729ea6254d0556a4cc6db0227029219f755fe6a Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 03:15:18 -0500 Subject: [PATCH 06/11] Fix(5) remove materialIdx in Vertex --- src/render/shaders/batch_draw_rgb.hlsl | 2 +- src/render/shaders/draw_gbuffer.hlsl | 149 ------------------------- src/render/shaders/shader_common.h | 1 - src/render/shaders/shader_utils.hlsl | 1 - src/render/shaders/viewer_draw.hlsl | 4 +- 5 files changed, 3 insertions(+), 154 deletions(-) delete mode 100644 src/render/shaders/draw_gbuffer.hlsl diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl index 377d60f5..554bd40f 100644 --- a/src/render/shaders/batch_draw_rgb.hlsl +++ b/src/render/shaders/batch_draw_rgb.hlsl @@ -278,7 +278,7 @@ PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID) float4 color = mat_data.color; int texture_idx = -1; - uint texture_count = mat_data.textureCount; + uint texture_count = mat_data.numTextures; if (texture_count > 0) { uint texture_start = mat_data.textureOffset; texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count]; diff --git a/src/render/shaders/draw_gbuffer.hlsl b/src/render/shaders/draw_gbuffer.hlsl deleted file mode 100644 index 22224574..00000000 --- a/src/render/shaders/draw_gbuffer.hlsl +++ /dev/null @@ -1,149 +0,0 @@ -#include "shader_utils.hlsl" - -[[vk::push_constant]] -DrawPushConst push_const; - -[[vk::binding(0, 0)]] -StructuredBuffer viewDataBuffer; - -[[vk::binding(1, 0)]] -StructuredBuffer engineInstanceBuffer; - -[[vk::binding(2, 0)]] -StructuredBuffer drawDataBuffer; - -[[vk::binding(3, 0)]] -StructuredBuffer shadowViewDataBuffer; - -// Asset descriptor bindings -[[vk::binding(0, 1)]] -StructuredBuffer vertexDataBuffer; - -[[vk::binding(1, 1)]] -StructuredBuffer materialBuffer; - -// Texture descriptor bindings -[[vk::binding(0, 2)]] -Texture2D materialTexturesArray[]; - -[[vk::binding(1, 2)]] -SamplerState linearSampler; - -[[vk::binding(2, 2)]] -StructuredBuffer materialTexturesIndices; - -struct V2F { - [[vk::location(0)]] float3 normal : TEXCOORD0; - [[vk::location(1)]] float3 position : TEXCOORD1; - [[vk::location(2)]] float4 color : TEXCOORD2; - [[vk::location(3)]] float dummy : TEXCOORD3; - [[vk::location(4)]] float2 uv : TEXCOORD4; - [[vk::location(5)]] int texIdx : TEXCOORD5; - [[vk::location(6)]] float roughness : TEXCOORD6; - [[vk::location(7)]] float metalness : TEXCOORD7; -}; - -[shader("vertex")] -float4 vert(in uint vid : SV_VertexID, - in uint draw_id : SV_InstanceID, - out V2F v2f) : SV_Position -{ - DrawData draw_data = drawDataBuffer[draw_id]; - - Vertex vert = unpackVertex(vertexDataBuffer[vid]); - float4 color = materialBuffer[vert.materialIdx].color; - uint instance_id = draw_data.instanceID; - - PerspectiveCameraData view_data =unpackViewData(viewDataBuffer[push_const.viewIdx]); - EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]); - - float3 to_view_translation; - float4 to_view_rotation; - computeCompositeTransform(instance_data.position, instance_data.rotation, - view_data.pos, view_data.rot, - to_view_translation, to_view_rotation); - - float3 view_pos = - rotateVec(to_view_rotation, instance_data.scale * vert.position) + - to_view_translation; - - float4 clip_pos = float4( - view_data.xScale * view_pos.x, - view_data.yScale * view_pos.z, - view_data.zNear, - view_pos.y); - - // v2f.viewPos = view_pos; -#if 0 - v2f.normal = normalize( - rotateVec(to_view_rotation, (vert.normal / instance_data.scale))); -#endif - v2f.normal = normalize( - rotateVec(instance_data.rotation, (vert.normal / instance_data.scale))); - v2f.uv = vert.uv; - v2f.color = color; - v2f.position = rotateVec(instance_data.rotation, - instance_data.scale * vert.position) + instance_data.position; - v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0]; - v2f.texIdx = materialBuffer[vert.materialIdx].textureIdx; - v2f.roughness = materialBuffer[vert.materialIdx].roughness; - v2f.metalness = materialBuffer[vert.materialIdx].metalness; - - return clip_pos; -} - -struct PixelOutput { - float4 color : SV_Target0; - float4 normal : SV_Target1; - float4 position : SV_Target2; -}; - -[shader("pixel")] -PixelOutput frag(in V2F v2f) -{ - PixelOutput output; - output.color = v2f.color; - output.color.a = v2f.roughness; - output.normal = float4(normalize(v2f.normal), 1.f); - output.position = float4(v2f.position, v2f.dummy * 0.0000001f); - output.position.a += v2f.metalness; - - // output.color.rgb = v2f.normal.xyz; - - if (v2f.texIdx != -1) { - output.color *= materialTexturesArray[v2f.texIdx].SampleLevel( - linearSampler, float2(v2f.uv.x, 1.f - v2f.uv.y), 0); - } - - return output; -} - -#if 0 -DrawInstanceData unpackDrawInstanceData(PackedDrawInstanceData data) -{ - const float4 d0 = data.packed[0]; - const float4 d1 = data.packed[1]; - const float4 d2 = data.packed[2]; - const float4 d3 = data.packed[3]; - const float4 d4 = data.packed[4]; - - DrawInstanceData out; - - float3 rot_col0 = d0.xyz; - float3 rot_col1 = float3(d0.w, d1.xy); - float3 rot_col2 = float3(d1.zw, d2.x); - - out.toViewRot = float3x3( - float3(rot_col0.x, rot_col1.x, rot_col2.x), - float3(rot_col0.y, rot_col1.y, rot_col2.y), - float3(rot_col0.z, rot_col1.z, rot_col2.z), - ); - out.toViewTranslation = d2.yzw; - out.objScale = d3.xyz; - out.viewIdx = asint(d3.w); - out.projScale = d4.xy; - out.projZNear = d4.z; - - return out; -} -#endif diff --git a/src/render/shaders/shader_common.h b/src/render/shaders/shader_common.h index 17136283..6885d881 100644 --- a/src/render/shaders/shader_common.h +++ b/src/render/shaders/shader_common.h @@ -144,7 +144,6 @@ struct Vertex { float3 normal; float4 tangentAndSign; float2 uv; - uint32_t materialIdx; }; struct PackedVertex { diff --git a/src/render/shaders/shader_utils.hlsl b/src/render/shaders/shader_utils.hlsl index 5d8b3715..f1427b19 100644 --- a/src/render/shaders/shader_utils.hlsl +++ b/src/render/shaders/shader_utils.hlsl @@ -56,7 +56,6 @@ Vertex unpackVertex(PackedVertex packed) vert.normal = normal; vert.tangentAndSign = tangent_and_sign; vert.uv = unpackHalf2x16(asuint(d1.z)); - vert.materialIdx = asuint(d1.w); return vert; } diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl index ebe1761f..d4fcd1b2 100644 --- a/src/render/shaders/viewer_draw.hlsl +++ b/src/render/shaders/viewer_draw.hlsl @@ -154,15 +154,15 @@ PixelOutput frag(in V2F v2f) { PixelOutput output; + MaterialData mat_data = materialBuffer[v2f.materialIdx]; float metalness = mat_data.metalness; float roughness = mat_data.roughness; - MaterialData mat_data = materialBuffer[v2f.materialIdx]; float4 color = mat_data.color; int texture_idx = -1; uint texture_count = mat_data.textureCount; if (texture_count > 0) { - uint texture_start = mat_data.textureOffset; + uint texture_start = mat_data.numTextures; texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count]; } if (texture_idx != -1) { From 316723b2e8ef16359fbdf998b87b9ad14ae428fc Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 06:05:16 -0500 Subject: [PATCH 07/11] fix(6) attachment format --- src/mw/device/sort_archetype.cpp | 2 +- src/render/batch_renderer.cpp | 11 ++-- src/render/render_common.hpp | 6 -- src/render/render_ctx.cpp | 85 +++++++++++----------------- src/render/shaders/post_process.hlsl | 2 +- src/render/shaders/viewer_draw.hlsl | 4 +- 6 files changed, 41 insertions(+), 69 deletions(-) diff --git a/src/mw/device/sort_archetype.cpp b/src/mw/device/sort_archetype.cpp index 8a56e6c1..1be54fd8 100644 --- a/src/mw/device/sort_archetype.cpp +++ b/src/mw/device/sort_archetype.cpp @@ -168,7 +168,7 @@ struct BlockRadixRankMatchEarlyCountsCustom for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) { int bin = lane + u * WARP_THREADS; - bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); + bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); } CTA_SYNC(); diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp index c2de67f6..28aa7724 100644 --- a/src/render/batch_renderer.cpp +++ b/src/render/batch_renderer.cpp @@ -378,15 +378,15 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev, }, }}; std::array colorFormats = { - InternalConfig::componentFormats[0], - InternalConfig::componentFormats[2], - InternalConfig::componentFormats[3] + InternalConfig::componentAttachFormats[0], + InternalConfig::componentAttachFormats[2], + InternalConfig::componentAttachFormats[3] }; VkPipelineRenderingCreateInfo rendering_info = {}; rendering_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO; rendering_info.colorAttachmentCount = 3; rendering_info.pColorAttachmentFormats = colorFormats.data(); - rendering_info.depthAttachmentFormat = InternalConfig::componentFormats[1]; + rendering_info.depthAttachmentFormat = InternalConfig::componentAttachFormats[1]; VkGraphicsPipelineCreateInfo gfx_infos = { @@ -471,8 +471,7 @@ static PipelineMP<1> makeShadowDrawPipeline(const vk::Device &dev, // Depth/Stencil VkPipelineDepthStencilStateCreateInfo depth_info {}; - depth_info.sType = - VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depth_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; depth_info.depthTestEnable = VK_TRUE; depth_info.depthWriteEnable = VK_TRUE; depth_info.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL; diff --git a/src/render/render_common.hpp b/src/render/render_common.hpp index 1f2b5a9f..334d4530 100644 --- a/src/render/render_common.hpp +++ b/src/render/render_common.hpp @@ -40,12 +40,6 @@ inline constexpr uint32_t componentBytes[maxComponents] = { 4, // VK_FORMAT_R8G8B8A8_UNORM (normals) 4 // VK_FORMAT_R32_SINT (segmentation: 4 bytes per int32) }; -inline constexpr VkFormat componentFormats[maxComponents] = { - VK_FORMAT_R8G8B8A8_UNORM, // RGB - VK_FORMAT_D32_SFLOAT, // depth - VK_FORMAT_R8G8B8A8_UNORM, // normals - VK_FORMAT_R32_SINT // segmentation -}; inline constexpr VkFormat componentAttachFormats[maxComponents] = { VK_FORMAT_R32G32B32A32_SFLOAT, // RGB (float4) VK_FORMAT_D32_SFLOAT, // depth (float) diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp index ffb0e476..423d3b6c 100644 --- a/src/render/render_ctx.cpp +++ b/src/render/render_ctx.cpp @@ -1420,12 +1420,12 @@ RenderContext::RenderContext( } { - VkDescriptorBindingFlags flags[] = { VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT, 0 }; + VkDescriptorBindingFlags flags[] = { VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT, 0, 0 }; VkDescriptorSetLayoutBindingFlagsCreateInfo flag_info = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO, .pNext = nullptr, - .bindingCount = 2, + .bindingCount = 3, .pBindingFlags = flags, }; @@ -2098,102 +2098,81 @@ CountT RenderContext::loadObjects(Span src_objs, } // DynArray desc_updates(9 + (material_textures_.size() > 0 ? 2 : 0)); - DynArray desc_updates(100); + DynArray desc_updates(12); VkDescriptorBufferInfo obj_info; obj_info.buffer = asset_buffer.buffer; obj_info.offset = 0; obj_info.range = buffer_sizes[0]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[0], asset_set_cull_, &obj_info, 0); + DescHelper::storage(desc_updates.emplace_back(), asset_set_cull_, &obj_info, 0); VkDescriptorBufferInfo mesh_info; mesh_info.buffer = asset_buffer.buffer; mesh_info.offset = buffer_offsets[0]; mesh_info.range = buffer_sizes[1]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[1], asset_set_cull_, &mesh_info, 1); + DescHelper::storage(desc_updates.emplace_back(), asset_set_cull_, &mesh_info, 1); + DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &mesh_info, 1); VkDescriptorBufferInfo vert_info; vert_info.buffer = asset_buffer.buffer; vert_info.offset = buffer_offsets[1]; vert_info.range = buffer_sizes[2]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[2], asset_set_draw_, &vert_info, 0); + DescHelper::storage(desc_updates.emplace_back(), asset_set_draw_, &vert_info, 0); + DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &vert_info, 0); VkDescriptorBufferInfo mat_info; mat_info.buffer = asset_buffer.buffer; mat_info.offset = buffer_offsets[3]; mat_info.range = buffer_sizes[4]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[3], asset_set_draw_, &mat_info, 1); + DescHelper::storage(desc_updates.emplace_back(), asset_set_draw_, &mat_info, 1); + DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &mat_info, 2); VkDescriptorBufferInfo index_set_info; index_set_info.buffer = asset_buffer.buffer; index_set_info.offset = buffer_offsets[2]; index_set_info.range = buffer_sizes[3]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[4], index_buffer_set, &index_set_info, 0); - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[5], asset_batch_lighting_set_, &vert_info, 0); - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[6], asset_batch_lighting_set_, &mesh_info, 1); - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[7], asset_batch_lighting_set_, &mat_info, 2); + DescHelper::storage(desc_updates.emplace_back(), index_buffer_set, &index_set_info, 0); VkDescriptorBufferInfo aabb_set_info; aabb_set_info.buffer = asset_buffer.buffer; aabb_set_info.offset = buffer_offsets[4]; aabb_set_info.range = buffer_sizes[5]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[8], aabb_buffer_set, &aabb_set_info, 0); + DescHelper::storage(desc_updates.emplace_back(), aabb_buffer_set, &aabb_set_info, 0); if (textures.size()) { material_textures_ = loadTextures(dev, alloc, renderQueue, textures); } - if (material_textures_.size()) { - DynArray tx_infos(material_textures_.size()); - for (auto &tx : material_textures_) { - tx_infos.push_back({ - .sampler = VK_NULL_HANDLE, - .imageView = tx.view, - .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL - }); - } - desc_updates.push_back({}); - DescHelper::textures(desc_updates[9], asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0); - - VkDescriptorBufferInfo mat_tx_info; - mat_tx_info.buffer = asset_buffer.buffer; - mat_tx_info.offset = buffer_offsets[5]; - mat_tx_info.range = buffer_sizes[6]; - - desc_updates.push_back({}); - DescHelper::storage(desc_updates[10], asset_set_mat_tex_, &mat_tx_info, 2); + DynArray tx_infos(material_textures_.size()); + for (const auto &tx : material_textures_) { + tx_infos.push_back({ + .sampler = VK_NULL_HANDLE, + .imageView = tx.view, + .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL + }); + } + if (tx_infos.size()) { + DescHelper::textures(desc_updates.emplace_back(), asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0); + } + + VkDescriptorBufferInfo mat_tx_info; + mat_tx_info.buffer = asset_buffer.buffer; + mat_tx_info.offset = buffer_offsets[5]; + mat_tx_info.range = buffer_sizes[6]; + if (tx_infos.size()) { + DescHelper::storage(desc_updates.emplace_back(), asset_set_mat_tex_, &mat_tx_info, 2); } DescHelper::update(dev, desc_updates.data(), desc_updates.size()); - AssetData asset_data { + loaded_assets_.emplace_back(AssetData { std::move(asset_buffer), (uint32_t)buffer_offsets[2], index_buffer_set, buffer_offsets[4], buffer_sizes[5], aabb_buffer_set - }; - - loaded_assets_.emplace_back(std::move(asset_data)); + }); return 0; } diff --git a/src/render/shaders/post_process.hlsl b/src/render/shaders/post_process.hlsl index 9969df1f..65b4c46c 100644 --- a/src/render/shaders/post_process.hlsl +++ b/src/render/shaders/post_process.hlsl @@ -372,7 +372,7 @@ float3 applyFXAA(uint32_t view_idx, int2 coord) { } // ------------------------------------------------------------------------------------------------ -[numThreads(16, 16, 1)] +[numThreads(32, 32, 1)] [shader("compute")] void main(uint3 idx : SV_DispatchThreadID) { diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl index d4fcd1b2..77129600 100644 --- a/src/render/shaders/viewer_draw.hlsl +++ b/src/render/shaders/viewer_draw.hlsl @@ -160,9 +160,9 @@ PixelOutput frag(in V2F v2f) float4 color = mat_data.color; int texture_idx = -1; - uint texture_count = mat_data.textureCount; + uint texture_count = mat_data.numTextures; if (texture_count > 0) { - uint texture_start = mat_data.numTextures; + uint texture_start = mat_data.textureOffset; texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count]; } if (texture_idx != -1) { From 89fd6681df13fb5cbe68b0436362d1f6b68328d2 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Thu, 4 Dec 2025 07:03:43 -0500 Subject: [PATCH 08/11] fix(7) other validation --- src/render/batch_renderer.cpp | 11 ++++++++--- src/render/shaders/batch_draw_rgb.hlsl | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp index 28aa7724..ecbf6bd3 100644 --- a/src/render/batch_renderer.cpp +++ b/src/render/batch_renderer.cpp @@ -1054,8 +1054,8 @@ static void issueComputeLayoutTransitions( return VkImageMemoryBarrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = isDepth ? VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT - : VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .srcAccessMask = isDepth ? VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT + : VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, .oldLayout = isDepth ? VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, @@ -1704,6 +1704,11 @@ BatchRenderer::~BatchRenderer() impl->dev.dt.destroyPipeline(impl->dev.hdl, impl->shadowDraw.hdls[0], nullptr); impl->dev.dt.destroyPipelineLayout(impl->dev.hdl, impl->shadowDraw.layout, nullptr); + if (impl->postProcess.has_value()) { + impl->dev.dt.destroyPipeline(impl->dev.hdl, impl->postProcess->hdls[0], nullptr); + impl->dev.dt.destroyPipelineLayout(impl->dev.hdl, impl->postProcess->layout, nullptr); + } + for (CountT i = 0; i < impl->batchFrames.size(); i++) { impl->dev.dt.destroyCommandPool(impl->dev.hdl, impl->batchFrames[i].prepareCmdPool, nullptr); impl->dev.dt.destroyCommandPool(impl->dev.hdl, impl->batchFrames[i].renderCmdPool, nullptr); @@ -1721,7 +1726,7 @@ BatchRenderer::~BatchRenderer() impl->dev.dt.destroyImageView(impl->dev.hdl, impl->batchFrames[i].targets[j].shadowMapView, nullptr); impl->dev.dt.destroyImageView(impl->dev.hdl, impl->batchFrames[i].targets[j].shadowDepthView, nullptr); } - } + } impl->dev.dt.destroyQueryPool(impl->dev.hdl, impl->timeQueryPool, nullptr); } diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl index 554bd40f..43929f47 100644 --- a/src/render/shaders/batch_draw_rgb.hlsl +++ b/src/render/shaders/batch_draw_rgb.hlsl @@ -113,7 +113,7 @@ void vert(in uint vid : SV_VertexID, v2f.worldPos = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position; v2f.position = clip_pos; - v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y); + v2f.uv = vert.uv; v2f.worldNormal = rotateVec(instance_data.rotation, vert.normal); v2f.worldIdx = instance_data.worldID; v2f.viewIdx = draw_data.viewID; From a2bf345c107fe21b5fa953add31a054f7f252903 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Fri, 5 Dec 2025 02:16:00 -0500 Subject: [PATCH 09/11] update batch raytrace --- include/madrona/mesh_bvh.hpp | 3 +- .../render/cuda_batch_render_assets.hpp | 5 +- src/bridge/mgr.cpp | 29 ++-- src/mw/device/bvh_raycast.cpp | 95 ++++-------- src/mw/device/ecs_render_bridge.cpp | 58 ++----- src/mw/device/include/madrona/bvh.hpp | 1 + src/render/asset_processor.cpp | 142 ++++++++++-------- 7 files changed, 147 insertions(+), 186 deletions(-) diff --git a/include/madrona/mesh_bvh.hpp b/include/madrona/mesh_bvh.hpp index 43c51ff4..288faed7 100644 --- a/include/madrona/mesh_bvh.hpp +++ b/include/madrona/mesh_bvh.hpp @@ -149,7 +149,8 @@ struct Material { // For now, just a color math::Vector4 color; - int32_t textureIdx; + int32_t textureOffset; + int32_t numTextures; float roughness; float metalness; diff --git a/include/madrona/render/cuda_batch_render_assets.hpp b/include/madrona/render/cuda_batch_render_assets.hpp index 42750006..8fd7a514 100644 --- a/include/madrona/render/cuda_batch_render_assets.hpp +++ b/include/madrona/render/cuda_batch_render_assets.hpp @@ -22,10 +22,11 @@ struct MeshBVHData { struct MaterialData { // GPU buffer containing array of texture objects cudaTextureObject_t *textures; - uint32_t numTextureBuffers; cudaArray_t *textureBuffers; - uint32_t numMipmapTextureBuffers; + uint32_t numTextureBuffers; cudaMipmappedArray_t *mipmapTextureBuffers; + uint32_t numMipmapTextureBuffers; + int32_t *materialTextures; Material *materials; }; diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp index 7cfcca0a..58718ca0 100644 --- a/src/bridge/mgr.cpp +++ b/src/bridge/mgr.cpp @@ -489,24 +489,25 @@ static RTAssets loadRenderObjects( }; } - SourceTexture *out_textures = tmp_alloc.allocN(model.numTextures); + // Use HeapArray with StackAlloc for consistency and better performance + // StackAlloc provides efficient temporary allocations that are automatically cleaned up + HeapArray textures(model.numTextures, tmp_alloc); for (CountT i = 0; i < model.numTextures; i++) { // TODO: NChans is not used. uint64_t tex_offset = model.texOffsets[i]; - Optional tex = SourceTexture { + textures[i] = SourceTexture { .data = model.texData + tex_offset, .format = SourceTextureFormat::R8G8B8A8, .width = (uint32_t)model.texWidths[i], .height = (uint32_t)model.texHeights[i], .numBytes = (size_t)(model.texWidths[i] * model.texHeights[i] * 4), }; - out_textures[i] = *tex; } - Span imported_textures = Span(out_textures, model.numTextures); - - std::vector materials; + // Use HeapArray instead of std::vector since size is known upfront + // This avoids dynamic reallocation and provides better performance + HeapArray materials(model.numMats, tmp_alloc); for (CountT i = 0; i < model.numMats; i++) { const math::Vector4 &rgba = model.matRGBA[i]; uint32_t mat_tex_offset = model.matTexOffsets[i]; @@ -514,14 +515,13 @@ static RTAssets loadRenderObjects( model.matTexOffsets[i + 1] : model.numMatTextures; uint32_t mat_tex_num = next_tex_offset - mat_tex_offset; - SourceMaterial mat = { + materials[i] = SourceMaterial { .color = math::Vector4{rgba.x, rgba.y, rgba.z, rgba.w}, .textureIdx = model.matTexIDs + mat_tex_offset, .numTextures = mat_tex_num, .roughness = 0.0f, .metalness = 0.0f }; - materials.push_back(mat); } // Create materials for geoms that do not have one assigned @@ -629,16 +629,19 @@ static RTAssets loadRenderObjects( } if (render_mgr.has_value()) { - render_mgr->loadObjects(objs, materials, imported_textures); + // HeapArray automatically converts to Span via template constructor + render_mgr->loadObjects(objs, materials, textures); } if (use_rt) { auto ret = RTAssets { render::AssetProcessor::makeBVHData(objs), - render::AssetProcessor::initMaterialData(materials.data(), - materials.size(), - imported_textures.data(), - imported_textures.size()) + render::AssetProcessor::initMaterialData( + materials.data(), + materials.size(), + textures.data(), + textures.size() + ) }; return ret; diff --git a/src/mw/device/bvh_raycast.cpp b/src/mw/device/bvh_raycast.cpp index 64e61df8..d09da6c9 100644 --- a/src/mw/device/bvh_raycast.cpp +++ b/src/mw/device/bvh_raycast.cpp @@ -602,22 +602,17 @@ static __device__ TraceResult traceRay( // Intersect with the children of the child to get a new node group // and calculate the present bits according to which were // intersected - uint32_t child_node_idx = - node_buffer[current_grp & 0xFFFF'FFFF].childrenIdx[child_idx]; + uint32_t child_node_idx = node_buffer[current_grp & 0xFFFF'FFFF].childrenIdx[child_idx]; bool child_is_leaf = (child_node_idx & 0x8000'0000); child_node_idx = child_node_idx & (~0x8000'0000); GroupType new_grp_type = GroupType::TopLevel; - if (parent_grp_type == GroupType::TopLevel && - child_is_leaf) { + if (parent_grp_type == GroupType::TopLevel && child_is_leaf) { // Need to compute new ray o/d/etc... instance_idx = (int32_t)(child_node_idx & ~0x8000'0000); - InstanceData *instance_data = world_info.instances + - instance_idx; - - current_bvh = bvhParams.bvhs + - world_info.instances[instance_idx].objectID; + InstanceData *instance_data = world_info.instances + instance_idx; + current_bvh = bvhParams.bvhs + world_info.instances[instance_idx].objectID; // Should be able to just do a continue in this case - we'll // just resume processing the parent node @@ -625,11 +620,9 @@ static __device__ TraceResult traceRay( continue; ray_o = instance_data->scale.inv() * - instance_data->rotation.inv().rotateVec( - (ray_o - instance_data->position)); + instance_data->rotation.inv().rotateVec(ray_o - instance_data->position); ray_d = instance_data->scale.inv() * - instance_data->rotation.inv().rotateVec( - ray_d); + instance_data->rotation.inv().rotateVec(ray_d); t_scale = ray_d.length(); t_max *= t_scale; @@ -682,12 +675,10 @@ static __device__ TraceResult traceRay( } } - current_grp = encodeNodeGroup( - child_node_idx, grp_present_bits, new_grp_type); + current_grp = encodeNodeGroup(child_node_idx, grp_present_bits, new_grp_type); if (tri_present_bits) { - triangle_grp = encodeNodeGroup( - child_node_idx, tri_present_bits, GroupType::Triangles); + triangle_grp = encodeNodeGroup(child_node_idx, tri_present_bits, GroupType::Triangles); } else { triangle_grp = invalidNodeGroup(); } @@ -701,28 +692,16 @@ static __device__ TraceResult traceRay( while (getTrianglePresentBits(triangle_grp) != 0) { // TODO: check active mask against heuristic to exit if not enough // threads are working on this - uint32_t local_node_tri_idx = - __ffs(getTrianglePresentBits(triangle_grp)) - 1; - - uint32_t local_leaf_idx = - local_node_tri_idx / MeshBVH::numTrisPerLeaf; - uint32_t tri_idx = - local_node_tri_idx % MeshBVH::numTrisPerLeaf; - - uint32_t glob_leaf_idx = - parent.childrenIdx[local_leaf_idx] & (~0x8000'0000); + uint32_t local_node_tri_idx = __ffs(getTrianglePresentBits(triangle_grp)) - 1; + uint32_t local_leaf_idx = local_node_tri_idx / MeshBVH::numTrisPerLeaf; + uint32_t tri_idx = local_node_tri_idx % MeshBVH::numTrisPerLeaf; + uint32_t glob_leaf_idx = parent.childrenIdx[local_leaf_idx] & (~0x8000'0000); TriHitInfo hit_info = triangleIntersect( - glob_leaf_idx, - tri_idx, - isect_info, - ray_o, - t_max, - current_bvh); + glob_leaf_idx, tri_idx, isect_info, ray_o, t_max, current_bvh); if (hit_info.hit) { t_max = hit_info.tHit; - tri_hit = hit_info; tri_hit.instanceIdx = instance_idx; } @@ -768,8 +747,7 @@ static __device__ TraceResult traceRay( int32_t material_idx = override_mat_id; if (override_mat_id == MaterialOverride::UseDefaultMaterial) { - material_idx = tri_hit.bvh->getMaterialIDX( - tri_hit.leafMaterialIndex); + material_idx = tri_hit.bvh->getMaterialIDX(tri_hit.leafMaterialIndex); } Vector3 color = { 1.f, 1.f, 1.f }; @@ -778,9 +756,9 @@ static __device__ TraceResult traceRay( color = hexToRgb(instance->color); } else if (material_idx != -1) { Material *mat = &bvhParams.materials[material_idx]; - - if (mat->textureIdx != -1) { - cudaTextureObject_t *tex = &bvhParams.textures[mat->textureIdx]; + if (mat->numTextures > 0) { + int32_t texture_idx = bvhParams.materialTextures[mat->textureOffset + instance->worldIDX % mat->numTextures]; + cudaTextureObject_t *tex = bvhParams.textures + texture_idx; // --- Mipmap LOD selection --- // Estimate LOD based on distance from camera and UV footprint @@ -799,12 +777,13 @@ static __device__ TraceResult traceRay( // Clamp LOD to [0, 8] lod = fminf(fmaxf(lod, 0.0f), 8.0f); - float4 sampled_color = tex2DLod(*tex, - tri_hit.uv.x, tri_hit.uv.y, lod); + float4 sampled_color = tex2DLod(*tex, tri_hit.uv.x, tri_hit.uv.y, lod); - Vector3 tex_color = { sampled_color.x, + Vector3 tex_color = { + sampled_color.x, sampled_color.y, - sampled_color.z }; + sampled_color.z + }; color.x = tex_color.x * mat->color.x; color.y = tex_color.y * mat->color.y; @@ -942,13 +921,13 @@ static __device__ FragmentResult computeFragment( // TODO: Definitely do some sort of ray fetching because there will // be threads doing nothing potentially. TraceResult shadow_hit = traceRay( - TraceInfo { + TraceInfo { .rayOrigin = hit_pos, .rayDirection = -ray_dir, .tMin = 0.000001f, .tMax = 10000.f, .dOnly = true - }, world_info); + }, world_info); if(shadow_hit.hit) { continue; } @@ -1004,29 +983,20 @@ extern "C" __global__ void bvhRaycastEntry() const uint32_t resident_view_offset = blockIdx.x; uint32_t current_view_offset = resident_view_offset; - - uint32_t bytes_per_view = - bvhParams.renderOutputWidth * bvhParams.renderOutputHeight * 4; - + uint32_t bytes_per_view = bvhParams.renderOutputWidth * bvhParams.renderOutputHeight * 4; uint32_t num_processed_pixels = 0; - uint32_t pixel_x = blockIdx.y * pixels_per_block + threadIdx.x; uint32_t pixel_y = blockIdx.z * pixels_per_block + threadIdx.y; while (current_view_offset < total_num_views) { // While we still have views to generate, trace. - PerspectiveCameraData *view_data = - &bvhParams.views[current_view_offset]; - + PerspectiveCameraData *view_data = &bvhParams.views[current_view_offset]; uint32_t world_idx = (uint32_t)view_data->worldIDX; - Vector3 ray_start = view_data->position; Vector3 ray_dir = calculateOutRay(view_data, pixel_x, pixel_y); uint32_t internal_nodes_offset = bvhParams.instanceOffsets[world_idx]; - - // This does both the tracing / lighting, etc... just like a fragment // shader does in GLSL. FragmentResult result = computeFragment( @@ -1038,22 +1008,15 @@ extern "C" __global__ void bvhRaycastEntry() .dOnly = false }, TraceWorldInfo { - .nodes = bvhParams.internalData->traversalNodes + - internal_nodes_offset, + .nodes = bvhParams.internalData->traversalNodes + internal_nodes_offset, .instances = bvhParams.instances + internal_nodes_offset, .lights = &bvhParams.lights[bvhParams.lightOffsets[world_idx]], .numLights = (uint32_t)bvhParams.lightCounts[world_idx] } ); - - - - uint32_t linear_pixel_idx = 4 * - (pixel_x + pixel_y * bvhParams.renderOutputWidth); - - uint32_t global_pixel_byte_off = current_view_offset * bytes_per_view + - linear_pixel_idx; + uint32_t linear_pixel_idx = 4 * (pixel_x + pixel_y * bvhParams.renderOutputWidth); + uint32_t global_pixel_byte_off = current_view_offset * bytes_per_view + linear_pixel_idx; if (bvhParams.raycastRGBD) { // Write both depth and color information diff --git a/src/mw/device/ecs_render_bridge.cpp b/src/mw/device/ecs_render_bridge.cpp index e41de39b..dba21fd4 100644 --- a/src/mw/device/ecs_render_bridge.cpp +++ b/src/mw/device/ecs_render_bridge.cpp @@ -10,6 +10,7 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params, void *timings, void *materials, void *textures, + void *material_textures, uint32_t num_sms, uint32_t sm_shared_memory) { @@ -20,44 +21,22 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params, StateManager *mgr = mwGPU::getStateManager(); mwGPU::HostAllocator *host_alloc = mwGPU::getHostAllocator(); mwGPU::TmpAllocator *tmp_alloc = &mwGPU::TmpAllocator::get(); - mwGPU::HostPrint *host_print = - (mwGPU::HostPrint *)mwGPU::GPUImplConsts::get().hostPrintAddr; + mwGPU::HostPrint *host_print = (mwGPU::HostPrint *)mwGPU::GPUImplConsts::get().hostPrintAddr; uint32_t raycast_rgbd = mwGPU::GPUImplConsts::get().raycastRGBD; params->numWorlds = num_worlds; - params->instances = mgr->getArchetypeComponent< - RenderableArchetype, InstanceData>(); - - params->views = mgr->getArchetypeComponent< - RenderCameraArchetype, PerspectiveCameraData>(); - - params->lights = mgr->getArchetypeComponent< - LightArchetype, LightDesc>(); - - params->instanceOffsets = (int32_t *)mgr->getArchetypeWorldOffsets< - RenderableArchetype>(); - - params->instanceCounts = (int32_t *)mgr->getArchetypeWorldCounts< - RenderableArchetype>(); - - params->aabbs = (TLBVHNode *)mgr->getArchetypeComponent< - RenderableArchetype, TLBVHNode>(); - - params->viewOffsets = (int32_t *)mgr->getArchetypeWorldOffsets< - RenderCameraArchetype>(); - - params->viewCounts = (int32_t *)mgr->getArchetypeWorldCounts< - RenderCameraArchetype>(); - - params->lightOffsets = (int32_t *)mgr->getArchetypeWorldOffsets< - LightArchetype>(); - - params->lightCounts = (int32_t *)mgr->getArchetypeWorldCounts< - LightArchetype>(); - - params->mortonCodes = (uint32_t *)mgr->getArchetypeComponent< - RenderableArchetype, MortonCode>(); + params->instances = mgr->getArchetypeComponent(); + params->views = mgr->getArchetypeComponent(); + params->lights = mgr->getArchetypeComponent(); + params->instanceOffsets = (int32_t *)mgr->getArchetypeWorldOffsets(); + params->instanceCounts = (int32_t *)mgr->getArchetypeWorldCounts(); + params->aabbs = (TLBVHNode *)mgr->getArchetypeComponent(); + params->viewOffsets = (int32_t *)mgr->getArchetypeWorldOffsets(); + params->viewCounts = (int32_t *)mgr->getArchetypeWorldCounts(); + params->lightOffsets = (int32_t *)mgr->getArchetypeWorldOffsets(); + params->lightCounts = (int32_t *)mgr->getArchetypeWorldCounts(); + params->mortonCodes = (uint32_t *)mgr->getArchetypeComponent(); params->bvhs = (MeshBVH *)bvhs; @@ -65,20 +44,15 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params, params->rgbOutput = (void *)mgr->getArchetypeComponent< RaycastOutputArchetype, render::RGBOutputBuffer>(); - params->depthOutput = (void *)mgr->getArchetypeComponent< RaycastOutputArchetype, render::DepthOutputBuffer>(); - params->normalOutput = (void *)mgr->getArchetypeComponent< RaycastOutputArchetype, render::NormalOutputBuffer>(); - params->segmentationOutput = (void *)mgr->getArchetypeComponent< RaycastOutputArchetype, render::SegmentationOutputBuffer>(); - params->renderOutputWidth = - mwGPU::GPUImplConsts::get().raycastOutputWidth; - params->renderOutputHeight = - mwGPU::GPUImplConsts::get().raycastOutputHeight; + params->renderOutputWidth = mwGPU::GPUImplConsts::get().raycastOutputWidth; + params->renderOutputHeight = mwGPU::GPUImplConsts::get().raycastOutputHeight; params->internalData = (BVHInternalData *)internal_data; @@ -87,8 +61,8 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params, params->hostPrintAddr = (void *)host_print; params->materials = (Material *)materials; - params->textures = (cudaTextureObject_t *)textures; + params->materialTextures = (int32_t *)material_textures; params->raycastRGBD = raycast_rgbd; diff --git a/src/mw/device/include/madrona/bvh.hpp b/src/mw/device/include/madrona/bvh.hpp index 19a36b71..6890e7b4 100644 --- a/src/mw/device/include/madrona/bvh.hpp +++ b/src/mw/device/include/madrona/bvh.hpp @@ -153,6 +153,7 @@ struct BVHParams { Material *materials; cudaTextureObject_t *textures; + int32_t *materialTextures; // Used to determine how many thread blocks per SM. uint32_t numSMs; diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp index a2206474..d1f62ee3 100644 --- a/src/render/asset_processor.cpp +++ b/src/render/asset_processor.cpp @@ -318,16 +318,23 @@ MaterialData initMaterialData( uint32_t num_textures) { // TODO: Only generate mipmaps for RGBA textures - // Count number of BC7 textures and number of non-BC7 textures + // Count textures by type: + // - BC7 (compressed): uses non-mipmapped cudaArray_t → textureBuffers + // - RGBA (uncompressed): uses mipmapped cudaMipmappedArray_t → mipmapTextureBuffers uint32_t num_non_mipmap_textures = 0; uint32_t num_mipmap_textures = 0; for (uint32_t i = 0; i < num_textures; ++i) { if (textures[i].format == imp::SourceTextureFormat::BC7) { - num_mipmap_textures++; - } else { num_non_mipmap_textures++; + } else { + num_mipmap_textures++; } } + + uint32_t num_material_textures = 0; + for (uint32_t i = 0; i < num_materials; ++i) { + num_material_textures += materials[i].numTextures; + } MaterialData cpu_mat_data = { .textures = (cudaTextureObject_t *) malloc(sizeof(cudaTextureObject_t) * num_textures), @@ -335,65 +342,73 @@ MaterialData initMaterialData( .numTextureBuffers = num_non_mipmap_textures, .mipmapTextureBuffers = (cudaMipmappedArray_t *) malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures), .numMipmapTextureBuffers = num_mipmap_textures, + .materialTextures = (int32_t *) malloc(sizeof(int32_t) * num_material_textures), .materials = (Material *) malloc(sizeof(Material) * num_materials) }; + // Separate indices for texture buffer arrays (they have different sizes) + uint32_t non_mipmap_idx = 0; + uint32_t mipmap_idx = 0; + + // Helper function to create texture descriptor (reduces duplication) + auto createTextureDesc = [](bool use_mipmap, uint32_t max_mip_level = 0) { + cudaTextureDesc tex_desc = {}; + tex_desc.addressMode[0] = cudaAddressModeWrap; + tex_desc.addressMode[1] = cudaAddressModeWrap; + tex_desc.filterMode = cudaFilterModeLinear; + tex_desc.readMode = cudaReadModeNormalizedFloat; + tex_desc.normalizedCoords = 1; + tex_desc.sRGB = 1; + if (use_mipmap) { + tex_desc.mipmapFilterMode = cudaFilterModeLinear; + tex_desc.maxMipmapLevelClamp = max_mip_level; + } + return tex_desc; + }; + for (uint32_t i = 0; i < num_textures; ++i) { const auto &tex = textures[i]; - int width, height; - void *pixels = nullptr; if (tex.format == imp::SourceTextureFormat::BC7) { - width = tex.width; - height = tex.height; - - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + // BC7 compressed textures: use non-mipmapped array + cudaChannelFormatDesc channel_desc = + cudaCreateChannelDesc(); cudaArray_t cuda_array; - REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, width, height, cudaArrayDefault)); + REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, + tex.width, tex.height, cudaArrayDefault)); REQ_CUDA(cudaMemcpy2DToArray(cuda_array, 0, 0, tex.data, - 16 * width / 4, - 16 * width / 4, - height / 4, + 16 * tex.width / 4, + 16 * tex.width / 4, + tex.height / 4, cudaMemcpyHostToDevice)); cudaResourceDesc res_desc = {}; res_desc.resType = cudaResourceTypeArray; res_desc.res.array.array = cuda_array; - cudaTextureDesc tex_desc = {}; - tex_desc.addressMode[0] = cudaAddressModeWrap; - tex_desc.addressMode[1] = cudaAddressModeWrap; - tex_desc.filterMode = cudaFilterModeLinear; - tex_desc.readMode = cudaReadModeNormalizedFloat; - tex_desc.normalizedCoords = 1; - tex_desc.sRGB = 1; - + cudaTextureDesc tex_desc = createTextureDesc(false); cudaTextureObject_t tex_obj = 0; REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr)); cpu_mat_data.textures[i] = tex_obj; - cpu_mat_data.textureBuffers[i] = cuda_array; + cpu_mat_data.textureBuffers[non_mipmap_idx++] = cuda_array; } else { - pixels = tex.data; - width = tex.width; - height = tex.height; - // TODO: Only generate mipmaps for RGBA textures - // Generate mipmaps - const uint32_t MAX_MIPS = 16; // Should be enough for any reasonable texture size - void* mip_data[MAX_MIPS]; - memset(mip_data, 0, sizeof(mip_data)); + // RGBA uncompressed textures: generate mipmaps + const uint32_t MAX_MIPS = 16; + void* mip_data[MAX_MIPS] = {}; uint32_t mip_widths[MAX_MIPS]; uint32_t mip_heights[MAX_MIPS]; uint32_t num_mips; - generateMipmaps(pixels, width, height, mip_data, mip_widths, mip_heights, num_mips); + generateMipmaps(tex.data, tex.width, tex.height, + mip_data, mip_widths, mip_heights, num_mips); - // Create CUDA array with mipmaps + // Create CUDA mipmapped array cudaMipmappedArray_t mipArray; cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - cudaExtent extent = make_cudaExtent(width, height, 0); + cudaExtent extent = make_cudaExtent(tex.width, tex.height, 0); REQ_CUDA(cudaMallocMipmappedArray(&mipArray, &channelDesc, extent, num_mips)); // Copy each mip level @@ -418,56 +433,59 @@ MaterialData initMaterialData( res_desc.resType = cudaResourceTypeMipmappedArray; res_desc.res.mipmap.mipmap = mipArray; - cudaTextureDesc tex_desc = {}; - tex_desc.addressMode[0] = cudaAddressModeWrap; - tex_desc.addressMode[1] = cudaAddressModeWrap; - tex_desc.filterMode = cudaFilterModeLinear; - tex_desc.readMode = cudaReadModeNormalizedFloat; - tex_desc.normalizedCoords = 1; - tex_desc.sRGB = 1; - tex_desc.mipmapFilterMode = cudaFilterModeLinear; - tex_desc.maxMipmapLevelClamp = num_mips - 1; - + cudaTextureDesc tex_desc = createTextureDesc(true, num_mips - 1); cudaTextureObject_t tex_obj = 0; - REQ_CUDA(cudaCreateTextureObject(&tex_obj, - &res_desc, &tex_desc, nullptr)); - + REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr)); cpu_mat_data.textures[i] = tex_obj; - cpu_mat_data.mipmapTextureBuffers[i] = mipArray; + cpu_mat_data.mipmapTextureBuffers[mipmap_idx++] = mipArray; } } + // Populate materials array + int32_t material_texture_offset = 0; for (uint32_t i = 0; i < num_materials; ++i) { - Material mat = { - .color = materials[i].color, - .textureIdx = materials[i].numTextures > 0 ? *(materials[i].textureIdx) : -1, //TODO: support batch - .roughness = materials[i].roughness, - .metalness = materials[i].metalness, + const SourceMaterial &src_mat = materials[i]; + cpu_mat_data.materials[i] = Material { + .color = src_mat.color, + .textureOffset = material_texture_offset, + .numTextures = src_mat.numTextures, + .roughness = src_mat.roughness, + .metalness = src_mat.metalness, }; - - cpu_mat_data.materials[i] = mat; + material_texture_offset += src_mat.numTextures; } + // Copy texture objects and materials to GPU cudaTextureObject_t *gpu_tex_buffer; REQ_CUDA(cudaMalloc(&gpu_tex_buffer, sizeof(cudaTextureObject_t) * num_textures)); REQ_CUDA(cudaMemcpy(gpu_tex_buffer, cpu_mat_data.textures, sizeof(cudaTextureObject_t) * num_textures, cudaMemcpyHostToDevice)); - Material *mat_buffer; - REQ_CUDA(cudaMalloc(&mat_buffer, sizeof(Material) * num_materials)); - REQ_CUDA(cudaMemcpy(mat_buffer, cpu_mat_data.materials, + Material *gpu_mat_buffer; + REQ_CUDA(cudaMalloc(&gpu_mat_buffer, sizeof(Material) * num_materials)); + REQ_CUDA(cudaMemcpy(gpu_mat_buffer, cpu_mat_data.materials, sizeof(Material) * num_materials, cudaMemcpyHostToDevice)); + int32_t *gpu_mat_texs; + REQ_CUDA(cudaMalloc(&gpu_mat_texs, sizeof(int32_t) * num_material_textures)); + REQ_CUDA(cudaMemcpy(gpu_mat_texs, cpu_mat_data.materialTextures, + sizeof(int32_t) * num_material_textures, + cudaMemcpyHostToDevice)); + + // Free CPU-side temporary buffers free(cpu_mat_data.textures); free(cpu_mat_data.materials); - free(cpu_mat_data.textureBuffers); - free(cpu_mat_data.mipmapTextureBuffers); - - auto gpu_mat_data = cpu_mat_data; + free(cpu_mat_data.materialTextures); + + // Note: textureBuffers and mipmapTextureBuffers are CUDA arrays that remain on GPU + // They are not freed here as they're part of the returned MaterialData + // Return GPU-side MaterialData + MaterialData gpu_mat_data = cpu_mat_data; gpu_mat_data.textures = gpu_tex_buffer; - gpu_mat_data.materials = mat_buffer; + gpu_mat_data.materials = gpu_mat_buffer; + gpu_mat_data.materialTextures = gpu_mat_texs; return gpu_mat_data; } From c6f4a007b106710e2260902301906dbb61c2e37a Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Fri, 5 Dec 2025 03:37:07 -0500 Subject: [PATCH 10/11] fix(1) --- include/madrona/mesh_bvh.hpp | 2 +- src/bridge/mgr.cpp | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/madrona/mesh_bvh.hpp b/include/madrona/mesh_bvh.hpp index 288faed7..981ccee8 100644 --- a/include/madrona/mesh_bvh.hpp +++ b/include/madrona/mesh_bvh.hpp @@ -150,7 +150,7 @@ struct Material { math::Vector4 color; int32_t textureOffset; - int32_t numTextures; + uint32_t numTextures; float roughness; float metalness; diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp index 58718ca0..d2a40be8 100644 --- a/src/bridge/mgr.cpp +++ b/src/bridge/mgr.cpp @@ -401,8 +401,6 @@ static RTAssets loadRenderObjects( Optional &render_mgr, bool use_rt) { - StackAlloc tmp_alloc; - std::array render_asset_paths; const char *py_root_env = getenv("MADRONA_ROOT_PATH"); std::filesystem::path data_dir = py_root_env ? (std::string(py_root_env) + "/data") : DATA_DIR; @@ -489,9 +487,8 @@ static RTAssets loadRenderObjects( }; } - // Use HeapArray with StackAlloc for consistency and better performance - // StackAlloc provides efficient temporary allocations that are automatically cleaned up - HeapArray textures(model.numTextures, tmp_alloc); + // Use HeapArray with default allocator - automatically cleaned up when it goes out of scope + HeapArray textures(model.numTextures); for (CountT i = 0; i < model.numTextures; i++) { // TODO: NChans is not used. @@ -505,9 +502,9 @@ static RTAssets loadRenderObjects( }; } - // Use HeapArray instead of std::vector since size is known upfront + // Use HeapArray with default allocator - automatically cleaned up when it goes out of scope // This avoids dynamic reallocation and provides better performance - HeapArray materials(model.numMats, tmp_alloc); + HeapArray materials(model.numMats); for (CountT i = 0; i < model.numMats; i++) { const math::Vector4 &rgba = model.matRGBA[i]; uint32_t mat_tex_offset = model.matTexOffsets[i]; From 0e553183c9013bd1fdca16dc29257f00caeb3f01 Mon Sep 17 00:00:00 2001 From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com> Date: Fri, 5 Dec 2025 05:34:13 -0500 Subject: [PATCH 11/11] fix(2) --- src/mw/cuda_exec.cpp | 91 +++++++++++----------------------- src/render/asset_processor.cpp | 9 +++- 2 files changed, 37 insertions(+), 63 deletions(-) diff --git a/src/mw/cuda_exec.cpp b/src/mw/cuda_exec.cpp index 193babea..0401db5e 100644 --- a/src/mw/cuda_exec.cpp +++ b/src/mw/cuda_exec.cpp @@ -1295,40 +1295,31 @@ static BVHKernels buildBVHKernels(const CompileConfig &cfg, } CUfunction bvh_build_fast; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_fast, mod, - "bvhBuildFast")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_fast, mod, "bvhBuildFast")); CUfunction bvh_build_slow; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_slow, mod, - "bvhBuildSlow")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_slow, mod, "bvhBuildSlow")); CUfunction bvh_init; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_init, mod, - "bvhInit")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_init, mod, "bvhInit")); CUfunction bvh_alloc; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_alloc, mod, - "bvhAllocInternalNodes")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_alloc, mod, "bvhAllocInternalNodes")); CUfunction bvh_aabbs; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_aabbs, mod, - "bvhConstructAABBs")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_aabbs, mod, "bvhConstructAABBs")); CUfunction widen_tree; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&widen_tree, mod, - "bvhWidenTree")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&widen_tree, mod, "bvhWidenTree")); CUfunction bvh_opt; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_opt, mod, - "bvhOptimizeLBVH")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_opt, mod, "bvhOptimizeLBVH")); CUfunction bvh_debug; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_debug, mod, - "bvhDebug")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_debug, mod, "bvhDebug")); CUfunction bvh_raycast_entry; - REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_raycast_entry, mod, - "bvhRaycastEntry")); + REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_raycast_entry, mod, "bvhRaycastEntry")); CUevent alloc_event; CUevent build_event; @@ -1364,8 +1355,7 @@ static BVHKernels buildBVHKernels(const CompileConfig &cfg, .stopEvent = stop_event, .recordedTimings = {}, .timingInfo = timing_info, - .raycastRGBD = (uint32_t)(render_mode == - CudaBatchRenderConfig::RenderMode::RGBD), + .raycastRGBD = (uint32_t)(render_mode == CudaBatchRenderConfig::RenderMode::RGBD), .meshBVHData = {}, .materialData = {}, }; @@ -1417,13 +1407,9 @@ static GPUKernels buildKernels(const CompileConfig &cfg, sizeof(const char *) * cfg.userSources.size()); // Build architecture string for this GPU - string gpu_arch_str = "sm_" + to_string(cuda_arch.first) + - to_string(cuda_arch.second); - + string gpu_arch_str = "sm_" + to_string(cuda_arch.first) + to_string(cuda_arch.second); string gpu_arch_flag = std::string("-arch=") + gpu_arch_str; - - string num_sms_str = - "-DMADRONA_MWGPU_NUM_SMS=(" + to_string(num_sms) + "_i32)"; + string num_sms_str = "-DMADRONA_MWGPU_NUM_SMS=(" + to_string(num_sms) + "_i32)"; CountT max_megakernel_blocks_per_sm = 1; for (const MegakernelConfig &megakernel_cfg : megakernel_cfgs) { @@ -1821,8 +1807,7 @@ static GPUEngineState initEngineAndUserState( 0, strm, nullptr, args.data())); }; - uint64_t num_init_bytes = - (uint64_t)num_world_init_bytes * (uint64_t)num_worlds; + uint64_t num_init_bytes = (uint64_t)num_world_init_bytes * (uint64_t)num_worlds; auto init_tmp_buffer = cu::allocGPU(num_init_bytes); REQ_CUDA(cudaMemcpyAsync(init_tmp_buffer, world_init_ptr, num_init_bytes, cudaMemcpyHostToDevice, strm)); @@ -1831,14 +1816,9 @@ static GPUEngineState initEngineAndUserState( REQ_CUDA(cudaMemcpyAsync(user_cfg_gpu_buffer, user_cfg_host_ptr, num_user_cfg_bytes, cudaMemcpyHostToDevice, strm)); - auto gpu_consts_readback = (GPUImplConsts *)cu::allocReadback( - sizeof(GPUImplConsts)); - - auto gpu_state_size_readback = (size_t *)cu::allocReadback( - sizeof(size_t)); - - auto exported_readback = (void **)cu::allocReadback( - sizeof(void *) * num_exported); + auto gpu_consts_readback = (GPUImplConsts *)cu::allocReadback(sizeof(GPUImplConsts)); + auto gpu_state_size_readback = (size_t *)cu::allocReadback(sizeof(size_t)); + auto exported_readback = (void **)cu::allocReadback(sizeof(void *) * num_exported); CUdeviceptr allocator_channel_devptr; REQ_CU(CudaDynamicLoader::cuMemAllocManaged(&allocator_channel_devptr, @@ -1864,16 +1844,14 @@ static GPUEngineState initEngineAndUserState( } HostAllocInit alloc_init { - std::max((uint64_t)sysconf(_SC_PAGESIZE), - (uint64_t)cu_va_alloc_granularity), + std::max((uint64_t)sysconf(_SC_PAGESIZE), (uint64_t)cu_va_alloc_granularity), (uint64_t)cu_va_alloc_granularity, allocator_channel, }; FreeQueue *fq = new FreeQueue {}; - std::thread allocator_thread( - gpuVMAllocatorThread, allocator_channel, cu_ctx, fq); + std::thread allocator_thread(gpuVMAllocatorThread, allocator_channel, cu_ctx, fq); auto host_print = std::make_unique(cu_gpu); @@ -1889,8 +1867,7 @@ static GPUEngineState initEngineAndUserState( raycast_output_width = render_cfg->renderWidth; raycast_output_height = render_cfg->renderHeight; - bvh_internals = cu::allocGPU( - sizeof(mwGPU::madrona::BVHInternalData)); + bvh_internals = cu::allocGPU(sizeof(mwGPU::madrona::BVHInternalData)); } else { bvh_ptrs = nullptr; num_bvhs = 0; @@ -1919,8 +1896,7 @@ static GPUEngineState initEngineAndUserState( exported_readback, user_cfg_gpu_buffer); - auto init_tasks_args = makeKernelArgBuffer( - num_taskgraphs, user_cfg_gpu_buffer); + auto init_tasks_args = makeKernelArgBuffer(num_taskgraphs, user_cfg_gpu_buffer); auto init_worlds_args = makeKernelArgBuffer(num_worlds, user_cfg_gpu_buffer, @@ -1928,8 +1904,7 @@ static GPUEngineState initEngineAndUserState( auto no_args = makeKernelArgBuffer(); - launchKernel(gpu_kernels.computeGPUImplConsts, 1, 1, - compute_consts_args); + launchKernel(gpu_kernels.computeGPUImplConsts, 1, 1, compute_consts_args); REQ_CUDA(cudaStreamSynchronize(strm)); @@ -1984,11 +1959,8 @@ static GPUEngineState initEngineAndUserState( job_sys_consts_size)); if (exec_mode == ExecutorMode::JobSystem) { - launchKernel(gpu_kernels.initWorlds, 1, consts::numMegakernelThreads, - no_args); - - uint32_t num_queue_blocks = - utils::divideRoundUp(num_worlds, consts::numEntryQueueThreads); + launchKernel(gpu_kernels.initWorlds, 1, consts::numMegakernelThreads, no_args); + uint32_t num_queue_blocks = utils::divideRoundUp(num_worlds, consts::numEntryQueueThreads); launchKernel(gpu_kernels.queueUserInit, num_queue_blocks, consts::numEntryQueueThreads, init_worlds_args); @@ -1997,9 +1969,7 @@ static GPUEngineState initEngineAndUserState( consts::numMegakernelThreads, no_args); } else if (exec_mode == ExecutorMode::TaskGraph) { launchKernel(gpu_kernels.initECS, 1, 1, init_ecs_args); - - uint32_t num_init_blocks = - utils::divideRoundUp(num_worlds, consts::numMegakernelThreads); + uint32_t num_init_blocks = utils::divideRoundUp(num_worlds, consts::numMegakernelThreads); launchKernel(gpu_kernels.initWorlds, num_init_blocks, consts::numMegakernelThreads, init_worlds_args); @@ -2012,8 +1982,7 @@ static GPUEngineState initEngineAndUserState( cu::deallocGPU(init_tmp_buffer); HeapArray exported_cols(num_exported); - memcpy(exported_cols.data(), exported_readback, - sizeof(void *) * (uint64_t)num_exported); + memcpy(exported_cols.data(), exported_readback, sizeof(void *) * (uint64_t)num_exported); cu::deallocCPU(exported_readback); @@ -2037,20 +2006,19 @@ static GPUEngineState initEngineAndUserState( bvh_kernels.timingInfo, render_cfg->materialData.materials, render_cfg->materialData.textures, + render_cfg->materialData.materialTextures, (uint32_t)num_sms, (uint32_t)shared_mem_per_sm); // Launch the kernel in the megakernel module to initialize the BVH // params - launchKernel(gpu_kernels.initBVHParams, 1, 1, - init_bvh_args); + launchKernel(gpu_kernels.initBVHParams, 1, 1, init_bvh_args); REQ_CUDA(cudaStreamSynchronize(strm)); // Call the bvh init function from bvh module auto bvh_init_internal_args = makeKernelArgBuffer(alloc_init); - launchKernel(bvh_kernels.init, 1, 1, - no_args); + launchKernel(bvh_kernels.init, 1, 1, no_args); REQ_CUDA(cudaStreamSynchronize(strm)); @@ -2210,8 +2178,7 @@ static DynArray processExecConfigFile( REQ_JSON(kv.value().get(num_blocks)); uint64_t node_idx; - auto res = std::from_chars(key.data(), key.data() + key.size(), - node_idx); + auto res = std::from_chars(key.data(), key.data() + key.size(), node_idx); if (res.ec != std::errc {}) { FATAL("MADRONA_MWGPU_EXEC_CONFIG_FILE points to invalid file"); diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp index d1f62ee3..0a4b4c7b 100644 --- a/src/render/asset_processor.cpp +++ b/src/render/asset_processor.cpp @@ -14,6 +14,7 @@ #include #include #include +#include using bytes = std::span; @@ -441,7 +442,7 @@ MaterialData initMaterialData( } } - // Populate materials array + // Populate materials and material_textures array int32_t material_texture_offset = 0; for (uint32_t i = 0; i < num_materials; ++i) { const SourceMaterial &src_mat = materials[i]; @@ -452,6 +453,12 @@ MaterialData initMaterialData( .roughness = src_mat.roughness, .metalness = src_mat.metalness, }; + + if (src_mat.textureIdx != nullptr && src_mat.numTextures > 0) { + memcpy(&cpu_mat_data.materialTextures[material_texture_offset], + src_mat.textureIdx, + sizeof(int32_t) * src_mat.numTextures); + } material_texture_offset += src_mat.numTextures; }