From e4001271d76b4a1b7be5d6dddba1f509fb9fb799 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Wed, 3 Dec 2025 22:17:13 -0500
Subject: [PATCH 01/11] update batch texture

---
 include/madrona/importer.hpp             |   9 +-
 src/bridge/bindings.cpp                  | 120 +++---
 src/bridge/mgr.cpp                       | 102 +++--
 src/bridge/mgr.hpp                       |   3 +-
 src/importer/gltf.cpp                    |  38 +-
 src/render/batch_renderer.cpp            |  82 ++--
 src/render/render_ctx.cpp                | 462 +++++++++--------------
 src/render/render_ctx.hpp                |   1 -
 src/render/shaders/batch_draw_depth.hlsl |  15 +-
 src/render/shaders/batch_draw_rgb.hlsl   | 124 +++---
 src/render/shaders/draw_gbuffer.hlsl     |  10 +-
 src/render/shaders/shader_common.h       |   8 +-
 src/render/shaders/shader_utils.hlsl     |   1 -
 src/render/shaders/viewer_cull.hlsl      |   6 +-
 src/render/shaders/viewer_draw.hlsl      |  81 ++--
 src/render/shaders/voxel_draw.hlsl       |   3 +
 16 files changed, 443 insertions(+), 622 deletions(-)
diff --git a/include/madrona/importer.hpp b/include/madrona/importer.hpp
index 8d9f9265..731035f4 100644
--- a/include/madrona/importer.hpp
+++ b/include/madrona/importer.hpp
@@ -47,8 +47,8 @@ struct SourceMaterial {
     // If this is -1, no texture will be applied. Otherwise,
     // the color gets multipled by color of the texture read in
     // at the UVs of the pixel.
-    int32_t textureIdx;
-
+    int32_t *textureIdx;
+    uint32_t numTextures;
     float roughness;
     float metalness;
 };
@@ -66,8 +66,7 @@ class ImageImporter {
     ImageImporter(ImageImporter &&);
     ~ImageImporter();
 
-    using ImportHandler =
-        Optional<SourceTexture> (*)(void *data, size_t num_bytes);
+    using ImportHandler = Optional<SourceTexture> (*)(void *data, size_t num_bytes);
 
     int32_t addHandler(const char *extension, ImportHandler fn);
 
@@ -106,9 +105,9 @@ struct ImportedAssets {
 
     DynArray<SourceObject> objects;
     DynArray<SourceMaterial> materials;
+    DynArray<uint32_t> materialTextures;
     DynArray<SourceInstance> instances;
     DynArray<SourceTexture> textures;
-
 };
 
 class AssetImporter {
diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp
index 30e1ce12..e5948391 100644
--- a/src/bridge/bindings.cpp
+++ b/src/bridge/bindings.cpp
@@ -23,59 +23,33 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
         .def("__init__", [](
             Manager *self,
             int64_t gpu_id,
-            nb::ndarray<const float, nb::shape<-1, 3>,
-                nb::device::cpu> mesh_vertices,
-            nb::ndarray<const int32_t, nb::shape<-1, 3>,
-                nb::device::cpu> mesh_faces,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> mesh_vertex_offsets,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> mesh_face_offsets,
-            nb::ndarray<const float, nb::shape<-1, 2>,
-                nb::device::cpu> mesh_texcoords,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> mesh_texcoord_offsets,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> mesh_texcoord_num,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> geom_types,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> geom_groups,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> geom_data_ids,
-            nb::ndarray<const float, nb::shape<-1, 3>,
-                nb::device::cpu> geom_sizes,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> geom_mat_ids,
-            nb::ndarray<const float, nb::shape<-1, 4>,
-                nb::device::cpu> geom_rgba,
-            nb::ndarray<const float, nb::shape<-1, 4>,
-                nb::device::cpu> mat_rgba,
-            nb::ndarray<const int32_t, nb::shape<-1, 10>,
-                nb::device::cpu> mat_tex_ids,
-            nb::ndarray<const uint8_t, nb::shape<-1>,
-                nb::device::cpu> tex_data,
-            nb::ndarray<const int64_t, nb::shape<-1>,
-                nb::device::cpu> tex_offsets,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> tex_widths,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> tex_heights,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> tex_nchans,
+            nb::ndarray<const float, nb::shape<-1, 3>, nb::device::cpu> mesh_vertices,
+            nb::ndarray<const int32_t, nb::shape<-1, 3>, nb::device::cpu> mesh_faces,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> mesh_vertex_offsets,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> mesh_face_offsets,
+            nb::ndarray<const float, nb::shape<-1, 2>, nb::device::cpu> mesh_texcoords,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> mesh_texcoord_offsets,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> geom_types,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> geom_groups,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> geom_data_ids,
+            nb::ndarray<const float, nb::shape<-1, 3>, nb::device::cpu> geom_sizes,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> geom_mat_ids,
+            nb::ndarray<const float, nb::shape<-1, 4>, nb::device::cpu> mat_rgba,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> mat_tex_ids,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> mat_tex_offsets,
+            nb::ndarray<const uint8_t, nb::shape<-1>, nb::device::cpu> tex_data,
+            nb::ndarray<const int64_t, nb::shape<-1>, nb::device::cpu> tex_offsets,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> tex_widths,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> tex_heights,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> tex_nchans,
             int64_t num_lights,
-            int64_t num_cams,
             int64_t num_worlds,
             int64_t batch_render_view_width,
             int64_t batch_render_view_height,
-            nb::ndarray<const float, nb::shape<-1>,
-                nb::device::cpu> cam_fovy,
-            nb::ndarray<const float, nb::shape<-1>,
-                nb::device::cpu> cam_znear,
-            nb::ndarray<const float, nb::shape<-1>,
-                nb::device::cpu> cam_zfar,
-            nb::ndarray<const int32_t, nb::shape<-1>,
-                nb::device::cpu> enabled_geom_groups,
+            nb::ndarray<const float, nb::shape<-1>, nb::device::cpu> cam_fovy,
+            nb::ndarray<const float, nb::shape<-1>, nb::device::cpu> cam_znear,
+            nb::ndarray<const float, nb::shape<-1>, nb::device::cpu> cam_zfar,
+            nb::ndarray<const int32_t, nb::shape<-1>, nb::device::cpu> enabled_geom_groups,
             bool add_cam_debug_geo,
             bool use_rt,
             VisualizerGPUHandles *viz_gpu_hdls)
@@ -87,21 +61,15 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 .triOffsets = (uint32_t *)mesh_face_offsets.data(),
                 .texCoords = (math::Vector2 *)mesh_texcoords.data(),
                 .texCoordOffsets = (int32_t *)mesh_texcoord_offsets.data(),
-                .texCoordNum = (uint32_t *)mesh_texcoord_num.data(),
                 .numVertices = (uint32_t)mesh_vertices.shape(0),
                 .numTris = (uint32_t)mesh_faces.shape(0),
                 .numMeshes = (uint32_t)mesh_vertex_offsets.shape(0),
             };
 
             // We need to make some copies because mgr.cpp will override
-            math::Vector4 *ptr_geom_rgba = (math::Vector4 *)malloc(
-                    sizeof(math::Vector4) * geom_rgba.shape(0));
-            int32_t *ptr_geom_mat_ids = (int32_t *)malloc(
-                    sizeof(int32_t) * geom_mat_ids.shape(0));
-            int32_t *ptr_geom_data_ids = (int32_t *)malloc(
-                    sizeof(int32_t) * geom_data_ids.shape(0));
+            int32_t *ptr_geom_mat_ids = (int32_t *)malloc(sizeof(int32_t) * geom_mat_ids.shape(0));
+            int32_t *ptr_geom_data_ids = (int32_t *)malloc(sizeof(int32_t) * geom_data_ids.shape(0));
 
-            memcpy(ptr_geom_rgba, geom_rgba.data(), sizeof(math::Vector4) * geom_rgba.shape(0));
             memcpy(ptr_geom_mat_ids, geom_mat_ids.data(), sizeof(int32_t) * geom_mat_ids.shape(0));
             memcpy(ptr_geom_data_ids, geom_data_ids.data(), sizeof(int32_t) * geom_data_ids.shape(0));
 
@@ -109,13 +77,13 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 .meshGeo = mesh_geo,
                 .geomTypes = (int32_t *)geom_types.data(),
                 .geomGroups = (int32_t *)geom_groups.data(),
-                .geomDataIDs = ptr_geom_data_ids,//(int32_t *)geom_data_ids.data(),
-                .geomMatIDs = ptr_geom_mat_ids,//(int32_t *)geom_mat_ids.data(),
+                .geomDataIDs = ptr_geom_data_ids,     // (int32_t *)geom_data_ids.data(),
+                .geomMatIDs = ptr_geom_mat_ids,       // (int32_t *)geom_mat_ids.data(),
                 .enabledGeomGroups = (int32_t *)enabled_geom_groups.data(),
                 .geomSizes = (math::Vector3 *)geom_sizes.data(),
-                .geomRGBA = ptr_geom_rgba,//(math::Vector4 *)geom_rgba.data(),
                 .matRGBA = (math::Vector4 *)mat_rgba.data(),
                 .matTexIDs = (int32_t *)mat_tex_ids.data(),
+                .matTexOffsets = (int32_t *)mat_tex_offsets.data(),
                 .texData = (uint8_t *) tex_data.data(),
                 .texOffsets = (int64_t *)tex_offsets.data(),
                 .texWidths = (int32_t *)tex_widths.data(),
@@ -123,8 +91,9 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 .texNChans = (int32_t *)tex_nchans.data(),
                 .numGeoms = (uint32_t)geom_types.shape(0),
                 .numMats = (uint32_t)mat_rgba.shape(0),
+                .numMatTextures = (uint32_t)mat_tex_ids.shape(0),
                 .numTextures = (uint32_t)tex_offsets.shape(0),
-                .numCams = (uint32_t)num_cams,
+                .numCams = (uint32_t)cam_fovy.shape(0),
                 .numLights = (uint32_t)num_lights,
                 .numEnabledGeomGroups = (uint32_t)enabled_geom_groups.shape(0),
                 .camFovy = (float *)cam_fovy.data(),
@@ -132,15 +101,18 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 .camZFar = (float *)cam_zfar.data(),
             };
 
-            new (self) Manager(Manager::Config {
-                .gpuID = (int)gpu_id,
-                .numWorlds = (uint32_t)num_worlds,
-                .batchRenderViewWidth = (uint32_t)batch_render_view_width,
-                .batchRenderViewHeight = (uint32_t)batch_render_view_height,
-                .addCamDebugGeometry = add_cam_debug_geo,
-                .useRT = use_rt,
-            }, gs_model, viz_gpu_hdls != nullptr ? *viz_gpu_hdls :
-                Optional<VisualizerGPUHandles>::none());
+            new (self) Manager(
+                Manager::Config {
+                    .gpuID = (int)gpu_id,
+                    .numWorlds = (uint32_t)num_worlds,
+                    .batchRenderViewWidth = (uint32_t)batch_render_view_width,
+                    .batchRenderViewHeight = (uint32_t)batch_render_view_height,
+                    .addCamDebugGeometry = add_cam_debug_geo,
+                    .useRT = use_rt,
+                },
+                gs_model,
+                viz_gpu_hdls != nullptr ? *viz_gpu_hdls : Optional<VisualizerGPUHandles>::none()
+            );
 
             free(ptr_geom_rgba);
             free(ptr_geom_mat_ids);
@@ -152,22 +124,20 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
            nb::arg("mesh_face_offsets"),
            nb::arg("mesh_texcoords"),
            nb::arg("mesh_texcoord_offsets"),
-           nb::arg("mesh_texcoord_num"),
            nb::arg("geom_types"),
            nb::arg("geom_groups"),
            nb::arg("geom_data_ids"),
            nb::arg("geom_sizes"),
            nb::arg("geom_mat_ids"),
-           nb::arg("geom_rgba"),
            nb::arg("mat_rgba"),
            nb::arg("mat_tex_ids"),
+           nb::arg("mat_tex_offsets"),
            nb::arg("tex_data"),
            nb::arg("tex_offsets"),
            nb::arg("tex_widths"),
            nb::arg("tex_heights"),
            nb::arg("tex_nchans"),
            nb::arg("num_lights"),
-           nb::arg("num_cams"),
            nb::arg("num_worlds"),
            nb::arg("batch_render_view_width"),
            nb::arg("batch_render_view_height"),
@@ -185,7 +155,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                         nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> cam_pos,
                         nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 4>> cam_rot,
                         nb::ndarray<nb::pytorch, const int32_t, nb::shape<-1, -1>> mat_ids,
-                        nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> geom_rgb,
+                        // nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> geom_rgb,
                         nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_sizes,
                         nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_pos,
                         nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_dir,
@@ -203,7 +173,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                      reinterpret_cast<const math::Vector3 *>(cam_pos.data()),
                      reinterpret_cast<const math::Quat *>(cam_rot.data()),
                      reinterpret_cast<const int32_t *>(mat_ids.data()),
-                     reinterpret_cast<const uint32_t *>(geom_rgb.data()),
+                    //  reinterpret_cast<const uint32_t *>(geom_rgb.data()),
                      reinterpret_cast<const math::Diag3x3 *>(geom_sizes.data()),
                      reinterpret_cast<const math::Vector3 *>(light_pos.data()),
                      reinterpret_cast<const math::Vector3 *>(light_dir.data()),
diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp
index 7945dedf..167bb256 100644
--- a/src/bridge/mgr.cpp
+++ b/src/bridge/mgr.cpp
@@ -265,11 +265,8 @@ struct Manager::Impl {
                      const float *light_attenuation,
                      const float *light_intensity)
     {
-        MWCudaLaunchGraph init_graph =
-            gpuExec.buildLaunchGraph(TaskGraphID::Init);
-        
-        MWCudaLaunchGraph render_init_graph =
-            gpuExec.buildLaunchGraph(TaskGraphID::RenderInit);
+        MWCudaLaunchGraph init_graph = gpuExec.buildLaunchGraph(TaskGraphID::Init);
+        MWCudaLaunchGraph render_init_graph = gpuExec.buildLaunchGraph(TaskGraphID::RenderInit);
 
         gpuExec.run(init_graph);
 
@@ -418,16 +415,15 @@ static RTAssets loadRenderObjects(
         },
         .objects { 0 },
         .materials { 0 },
+        .materialTextures { 0 },
         .instances { 0 },
         .textures { 0 },
     };
 
-    HeapArray<SourceMesh> meshes(
-        model.meshGeo.numMeshes + (size_t)RenderPrimObjectIDs::NumPrims);
+    HeapArray<SourceMesh> meshes(model.meshGeo.numMeshes + (size_t)RenderPrimObjectIDs::NumPrims);
     const CountT num_meshes = (CountT)model.meshGeo.numMeshes;
 
-    meshes[(size_t)RenderPrimObjectIDs::DebugCam] = 
-        disk_render_assets->objects[(size_t)RenderPrimObjectIDs::DebugCam].meshes[0];
+    meshes[(size_t)RenderPrimObjectIDs::DebugCam] = disk_render_assets->objects[(size_t)RenderPrimObjectIDs::DebugCam].meshes[0];
     meshes[(size_t)RenderPrimObjectIDs::Plane] = CreatePlane(generated_assets);
     meshes[(size_t)RenderPrimObjectIDs::Sphere] = CreateSphere(generated_assets);
     meshes[(size_t)RenderPrimObjectIDs::Box] = CreateBox(generated_assets);
@@ -475,9 +471,10 @@ static RTAssets loadRenderObjects(
     SourceTexture *out_textures = tmp_alloc.allocN<SourceTexture>(model.numTextures);
 
     for (CountT i = 0; i < model.numTextures; i++) {
+        // TODO: NChans is not used.
         uint64_t tex_offset = model.texOffsets[i];
         Optional<SourceTexture> tex = SourceTexture {
-            .data = &model.texData[tex_offset],
+            .data = model.texData + tex_offset,
             .format = SourceTextureFormat::R8G8B8A8,
             .width = (uint32_t)model.texWidths[i],
             .height = (uint32_t)model.texHeights[i],
@@ -490,25 +487,30 @@ static RTAssets loadRenderObjects(
 
     std::vector<imp::SourceMaterial> materials;
     for (CountT i = 0; i < model.numMats; i++) {
-        int32_t tex_idx = model.matTexIDs[i * 10];
+        const math::Vector4 &rgba = model.matRGBA[i];
+        uint32_t mat_tex_offset = model.matTexOffsets[i];
+        uint32_t next_tex_offset = i < model.numMats - 1 ?
+            model.matTexOffsets[i + 1] : model.numMatTextures;
+        uint32_t mat_tex_num = next_tex_offset - mat_tex_offset;
+
         SourceMaterial mat = {
-            .color = math::Vector4{
-                model.matRGBA[i].x, model.matRGBA[i].y,
-                model.matRGBA[i].z, model.matRGBA[i].w},
-            .textureIdx = tex_idx,
+            .color = math::Vector4{rgba.x, rgba.y, rgba.z, rgba.w},
+            .textureIdx = model.matTexIDs + mat_tex_offset,
+            .numTextures = mat_tex_num,
             .roughness = 0.0f,
-            .metalness = 0.0f};
+            .metalness = 0.0f
+        };
         materials.push_back(mat);
     }
 
     // Create materials for geoms that do not have one assigned
     for (CountT i = 0; i < model.numGeoms; i++) {
         if (model.geomMatIDs[i] == -1) {
+            const math::Vector4 &rgba_i = model.geomRGBA[i];
             SourceMaterial mat = {
-                .color = math::Vector4{
-                    model.geomRGBA[i].x, model.geomRGBA[i].y,
-                    model.geomRGBA[i].z, model.geomRGBA[i].w},
-                .textureIdx = -1,
+                .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w},
+                .textureIdx = nullptr,
+                .numTextures = 0,
                 .roughness = 0.8f,
                 .metalness = 0.2f,
             };
@@ -517,12 +519,13 @@ static RTAssets loadRenderObjects(
 
             for (CountT j = i + 1; j < model.numGeoms; j++) {
                 // FIX: Should probably implement == op for Vector4
+                const math::Vector4 &rgba_j = model.geomRGBA[j];
                 if (model.geomMatIDs[j] == -1 && 
-                    model.geomRGBA[i].x == model.geomRGBA[j].x &&
-                    model.geomRGBA[i].y == model.geomRGBA[j].y &&
-                    model.geomRGBA[i].z == model.geomRGBA[j].z &&
-                    model.geomRGBA[i].w == model.geomRGBA[j].w) 
-                {
+                    rgba_i.x == rgba_j.x &&
+                    rgba_i.y == rgba_j.y &&
+                    rgba_i.z == rgba_j.z &&
+                    rgba_i.w == rgba_j.w
+                ) {
                     model.geomMatIDs[j] = materials.size() - 1;
                 }
             }
@@ -643,44 +646,30 @@ Manager::Impl * Manager::Impl::make(
     sim_cfg.useRT = use_rt;
 
     CUcontext cu_ctx = MWCudaExecutor::initCUDA(mgr_cfg.gpuID);
-
-    Optional<RenderGPUState> render_gpu_state =
-        initRenderGPUState(mgr_cfg, viz_gpu_hdls);
-
+    Optional<RenderGPUState> render_gpu_state = initRenderGPUState(mgr_cfg, viz_gpu_hdls);
     Optional<render::RenderManager> render_mgr =
-        initRenderManager(mgr_cfg, gs_model,
-                          viz_gpu_hdls, render_gpu_state);
+        initRenderManager(mgr_cfg, gs_model, viz_gpu_hdls, render_gpu_state);
 
-    RTAssets rt_assets = loadRenderObjects(
-            gs_model, render_mgr, use_rt);
+    RTAssets rt_assets = loadRenderObjects(gs_model, render_mgr, use_rt);
     if (render_mgr.has_value()) {
         sim_cfg.renderBridge = render_mgr->bridge();
     } else {
         sim_cfg.renderBridge = nullptr;
     }
 
-    int32_t *geom_types_gpu = (int32_t *)cu::allocGPU(
-        sizeof(int32_t) * gs_model.numGeoms);
-    int32_t *geom_data_ids_gpu = (int32_t *)cu::allocGPU(
-        sizeof(int32_t) * gs_model.numGeoms);
-    Vector3 *geom_sizes_gpu = (Vector3 *)cu::allocGPU(
-        sizeof(Vector3) * gs_model.numGeoms);
+    int32_t *geom_types_gpu = (int32_t *)cu::allocGPU(sizeof(int32_t) * gs_model.numGeoms);
+    int32_t *geom_data_ids_gpu = (int32_t *)cu::allocGPU(sizeof(int32_t) * gs_model.numGeoms);
+    Vector3 *geom_sizes_gpu = (Vector3 *)cu::allocGPU(sizeof(Vector3) * gs_model.numGeoms);
     float *cam_fovy = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams);
     float *cam_zfar = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams);
     float *cam_znear = (float *)cu::allocGPU(sizeof(float) * gs_model.numCams);
 
-    REQ_CUDA(cudaMemcpy(geom_types_gpu, gs_model.geomTypes,
-        sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice));
-    REQ_CUDA(cudaMemcpy(geom_data_ids_gpu, gs_model.geomDataIDs,
-        sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice));
-    REQ_CUDA(cudaMemcpy(geom_sizes_gpu, gs_model.geomSizes,
-        sizeof(Vector3) * gs_model.numGeoms, cudaMemcpyHostToDevice));
-    REQ_CUDA(cudaMemcpy(cam_fovy, gs_model.camFovy,
-        sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
-    REQ_CUDA(cudaMemcpy(cam_znear, gs_model.camZNear,
-        sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
-    REQ_CUDA(cudaMemcpy(cam_zfar, gs_model.camZFar,
-        sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(geom_types_gpu, gs_model.geomTypes, sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(geom_data_ids_gpu, gs_model.geomDataIDs, sizeof(int32_t) * gs_model.numGeoms, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(geom_sizes_gpu, gs_model.geomSizes, sizeof(Vector3) * gs_model.numGeoms, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(cam_fovy, gs_model.camFovy, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(cam_znear, gs_model.camZNear, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
+    REQ_CUDA(cudaMemcpy(cam_zfar, gs_model.camZFar, sizeof(float) * gs_model.numCams, cudaMemcpyHostToDevice));
 
     sim_cfg.geomTypes = geom_types_gpu;
     sim_cfg.geomDataIDs = geom_data_ids_gpu;
@@ -690,9 +679,7 @@ Manager::Impl * Manager::Impl::make(
     sim_cfg.camZFar = cam_zfar;
 
     HeapArray<Sim::WorldInit> world_inits(mgr_cfg.numWorlds);
-
-    Optional<CudaBatchRenderConfig> render_cfg = 
-        Optional<CudaBatchRenderConfig>::none();
+    Optional<CudaBatchRenderConfig> render_cfg = Optional<CudaBatchRenderConfig>::none();
     if (use_rt) {
         render_cfg = {
             .renderMode = CudaBatchRenderConfig::RenderMode::RGBD,
@@ -703,9 +690,7 @@ Manager::Impl * Manager::Impl::make(
         };
     }
 
-    std::vector<std::string> hideseek_srcs = {
-            GPU_HIDESEEK_SRC_LIST
-        };
+    std::vector<std::string> hideseek_srcs = { GPU_HIDESEEK_SRC_LIST };
     const char *py_root_env = getenv("MADRONA_ROOT_PATH");
     std::filesystem::path root_dir = py_root_env ? py_root_env : std::filesystem::current_path();
     std::for_each(
@@ -735,8 +720,7 @@ Manager::Impl * Manager::Impl::make(
         CompileConfig::OptMode::LTO,
     }, cu_ctx, render_cfg);
 
-    Optional<MWCudaLaunchGraph> raytrace_graph =
-        Optional<MWCudaLaunchGraph>::none();
+    Optional<MWCudaLaunchGraph> raytrace_graph = Optional<MWCudaLaunchGraph>::none();
 
     if (use_rt) {
         raytrace_graph = gpu_exec.buildRenderGraph();
diff --git a/src/bridge/mgr.hpp b/src/bridge/mgr.hpp
index 3ae2a692..a887969d 100644
--- a/src/bridge/mgr.hpp
+++ b/src/bridge/mgr.hpp
@@ -26,7 +26,6 @@ struct GSModelGeometry {
     uint32_t *triOffsets;
     madrona::math::Vector2 *texCoords;
     int32_t *texCoordOffsets;
-    uint32_t *texCoordNum;
     uint32_t numVertices;
     uint32_t numTris;
     uint32_t numMeshes;
@@ -43,6 +42,7 @@ struct GSModel {
     madrona::math::Vector4 *geomRGBA;
     madrona::math::Vector4 *matRGBA;
     int32_t *matTexIDs;
+    int32_t *matTexOffsets;
     uint8_t *texData;
     int64_t *texOffsets;
     int32_t *texWidths;
@@ -50,6 +50,7 @@ struct GSModel {
     int32_t *texNChans;
     uint32_t numGeoms;
     uint32_t numMats;
+    uint32_t numMatTextures;
     uint32_t numTextures;
     uint32_t numCams;
     uint32_t numLights;
diff --git a/src/importer/gltf.cpp b/src/importer/gltf.cpp
index 9e2977f5..1532be5e 100644
--- a/src/importer/gltf.cpp
+++ b/src/importer/gltf.cpp
@@ -1744,16 +1744,11 @@ static bool gltfImportAssets(LoaderData &loader,
     }
 
     imported.geoData.meshArrays.resize(new_mesh_arrays_start, [](auto *) {});
-    imported.geoData.positionArrays.resize(new_vert_arrays_start,
-                                           [](auto *) {});
-    imported.geoData.normalArrays.resize(new_normal_arrays_start,
-                                         [](auto *) {});
-    imported.geoData.uvArrays.resize(new_uvs_arrays_start,
-                                         [](auto *) {});
-    imported.objects.resize(new_objects_start,
-                            [](auto *) {});
-    imported.instances.resize(new_instances_start,
-                              [](auto *) {});
+    imported.geoData.positionArrays.resize(new_vert_arrays_start, [](auto *) {});
+    imported.geoData.normalArrays.resize(new_normal_arrays_start, [](auto *) {});
+    imported.geoData.uvArrays.resize(new_uvs_arrays_start, [](auto *) {});
+    imported.objects.resize(new_objects_start, [](auto *) {});
+    imported.instances.resize(new_instances_start, [](auto *) {});
 
     imported.objects.push_back({
         .meshes = Span<SourceMesh>(
@@ -1816,14 +1811,23 @@ static bool gltfImportAssets(LoaderData &loader,
         int32_t texture_id = material.baseColorIdx;
         if (texture_id != -1) {
             texture_id += prev_tex_idx;
+            imported.materialTextures.push_back(uint32_t(texture_id));
+            imported.materials.emplace_back(SourceMaterial {
+                .color = material.baseColor,
+                .textureIdx = &imported.materialTextures[imported.materialTextures.size() - 1],
+                .numTextures = 1,
+                .roughness = material.roughness,
+                .metalness = material.metallic,
+            });
+        } else {
+            imported.materials.emplace_back(SourceMaterial {
+                .color = material.baseColor,
+                .textureIdx = nullptr,
+                .numTextures = 0,
+                .roughness = material.roughness,
+                .metalness = material.metallic,
+            });
         }
-        SourceMaterial s_mat = {
-            .color = material.baseColor,
-            .textureIdx = texture_id,
-            .roughness = material.roughness,
-            .metalness = material.metallic,
-        };
-        imported.materials.emplace_back(s_mat);
     }
 
     return true;
diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp
index b5facf1d..23bb9789 100644
--- a/src/render/batch_renderer.cpp
+++ b/src/render/batch_renderer.cpp
@@ -327,7 +327,7 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev,
     blend_info.pAttachments = blend_attachments.data();
 
     // Dynamic
-    std::array dyn_enable {
+    std::array<VkDynamicState, 2> dyn_enable {{
         VK_DYNAMIC_STATE_VIEWPORT,
         VK_DYNAMIC_STATE_SCISSOR,
     };
@@ -753,9 +753,7 @@ static DrawCommandPackage makeDrawCommandPackage(vk::Device& dev,
     };
     int64_t buffer_offsets[2];
 
-    int64_t num_draw_bytes = utils::computeBufferOffsets(
-        buffer_sizes, buffer_offsets, 256);
-
+    int64_t num_draw_bytes = utils::computeBufferOffsets(buffer_sizes, buffer_offsets, 256);
     vk::LocalBuffer drawBuffer = alloc.makeLocalBuffer(num_draw_bytes).value();
 
     std::array<VkWriteDescriptorSet, 6> desc_updates;
@@ -931,8 +929,7 @@ static void makeBatchFrame(vk::Device& dev,
     }
 
     uint32_t max_num_view = cfg.numWorlds * cfg.maxViewsPerWorld;
-    HeapArray<LayeredTarget> layered_targets = makeLayeredTargets(
-        cfg.renderWidth, cfg.renderHeight, max_num_view, dev, alloc);
+    HeapArray<LayeredTarget> layered_targets = makeLayeredTargets(cfg.renderWidth, cfg.renderHeight, max_num_view, dev, alloc);
     uint64_t num_pixels = static_cast<uint64_t>(max_num_view) * cfg.renderWidth * cfg.renderHeight; 
 
     {
@@ -1146,7 +1143,6 @@ static void issueRasterization(vk::Device &dev,
     rendering_info.pDepthAttachment = &depth_attach;
 
     dev.dt.cmdBeginRenderingKHR(draw_cmd, &rendering_info);
-
     dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, draw_pipeline.hdls[0]);
     dev.dt.cmdBindIndexBuffer(draw_cmd, loaded_assets[0].buf.buffer,
                               loaded_assets[0].idxBufferOffset,
@@ -1160,13 +1156,10 @@ static void issueRasterization(vk::Device &dev,
         batch_frame.shadowAssetSet,
     };
 
-    dev.dt.cmdBindDescriptorSets(draw_cmd,
-                                 VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                 draw_pipeline.layout,
-                                 0, 
-                                 draw_descriptors.size(),
-                                 draw_descriptors.data(), 
-                                 0, nullptr);
+    dev.dt.cmdBindDescriptorSets(
+        draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, draw_pipeline.layout, 0, 
+        draw_descriptors.size(), draw_descriptors.data(), 0, nullptr
+    );
 
     uint32_t max_image_dim_x = std::min(consts::maxTextureDim, consts::maxNumImagesX * target.viewWidth);
     uint32_t max_num_image_x = max_image_dim_x / target.viewWidth;
@@ -1263,7 +1256,7 @@ static void issueDeferred(vk::Device &dev,
             sizeof(shader::DeferredLightingPushConstBR),
             &push_const);
 
-    std::array draw_descriptors = {
+    std::array<VkDescriptorSet, 3> draw_descriptors {
         batch_frame.targetsSetLighting,
         batch_frame.viewInstanceSetLighting,
         pbr_set
@@ -1275,14 +1268,10 @@ static void issueDeferred(vk::Device &dev,
                                  draw_descriptors.data(),
                                  0, nullptr);
 
-    uint32_t num_workgroups_x = utils::divideRoundUp(
-        render_dims.width, 32_u32);
-    uint32_t num_workgroups_y = utils::divideRoundUp(
-        render_dims.height, 32_u32);
+    uint32_t num_workgroups_x = utils::divideRoundUp(render_dims.width, 32_u32);
+    uint32_t num_workgroups_y = utils::divideRoundUp(render_dims.height, 32_u32);
     uint32_t num_workgroups_z = total_num_views;
-
-    dev.dt.cmdDispatch(
-        draw_cmd, num_workgroups_x, num_workgroups_y, num_workgroups_z);
+    dev.dt.cmdDispatch(draw_cmd, num_workgroups_x, num_workgroups_y, num_workgroups_z);
 }
 
 static void issueShadowGen(vk::Device &dev,
@@ -1325,8 +1314,10 @@ static void issueShadowGen(vk::Device &dev,
                             &push_const);
 
     // Descriptor sets
-    dev.dt.cmdBindDescriptorSets(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
-        pipeline.layout, 0, 1, &frame.shadowGenSet, 0, nullptr);
+    dev.dt.cmdBindDescriptorSets(
+        draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
+        pipeline.layout, 0, 1, &frame.shadowGenSet, 0, nullptr
+    );
 
     uint32_t num_workgroups_x = utils::divideRoundUp(max_num_views, 256_u32);
     dev.dt.cmdDispatch(draw_cmd, num_workgroups_x, 1, 1);
@@ -1436,27 +1427,21 @@ static void issueShadowDraw(vk::Device &dev,
     rendering_info.pDepthAttachment = &depth_attach;
 
     dev.dt.cmdBeginRenderingKHR(draw_cmd, &rendering_info);
+    dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline.hdls[0]);
+    dev.dt.cmdBindIndexBuffer(
+        draw_cmd, loaded_assets[0].buf.buffer, loaded_assets[0].idxBufferOffset, VK_INDEX_TYPE_UINT32
+    );
 
-    dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
-            pipeline.hdls[0]);
-
-    dev.dt.cmdBindIndexBuffer(draw_cmd, loaded_assets[0].buf.buffer,
-            loaded_assets[0].idxBufferOffset,
-            VK_INDEX_TYPE_UINT32);
-
-    std::array draw_descriptors {
+    std::array<VkDescriptorSet, 3> draw_descriptors {
         batch_frame.shadowDrawSet,
         view_batch.drawBufferSetDraw,
         asset_set,
     };
 
-    dev.dt.cmdBindDescriptorSets(draw_cmd,
-                                 VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                 pipeline.layout,
-                                 0,
-                                 draw_descriptors.size(),
-                                 draw_descriptors.data(),
-                                 0, nullptr);
+    dev.dt.cmdBindDescriptorSets(
+        draw_cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline.layout, 0,
+        draw_descriptors.size(), draw_descriptors.data(), 0, nullptr
+    );
 
     uint32_t max_image_dim_x = std::min(consts::maxTextureDim, consts::maxNumImagesX * target.viewWidth);
     uint32_t max_num_image_x = max_image_dim_x / target.viewWidth;
@@ -1651,7 +1636,7 @@ BatchRenderer::Impl::Impl(const Config &cfg, RenderContext &rctx):
     batchFrames(cfg.numFrames),
     assetSetPrepare(rctx.asset_set_cull_),
     assetSetDraw(rctx.asset_set_draw_),
-    assetSetTextureMat(rctx.asset_set_tex_compute_),
+    assetSetTextureMat(rctx.asset_set_mat_tex_),
     assetSetLighting(rctx.asset_batch_lighting_set_),
     renderExtent { cfg.renderWidth, cfg.renderHeight },
     selectedView(0),
@@ -1781,11 +1766,10 @@ static void issuePrepareViewsPipeline(vk::Device& dev,
 
     (void)num_views;
     (void)num_processed_batches;
-    dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
-                           prepare_views.hdls[0]);
+    dev.dt.cmdBindPipeline(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE, prepare_views.hdls[0]);
 
     { // Dispatch the compute shader
-        std::array view_gen_descriptors = {
+        std::array<VkDescriptorSet, 4> view_gen_descriptors = {
             frame.viewInstanceSetPrepare,
             batch.drawBufferSetPrepare,
             assetSetPrepareView,
@@ -1793,11 +1777,13 @@ static void issuePrepareViewsPipeline(vk::Device& dev,
             rctx.loaded_assets_[0].aabbSet
         };
 
-        dev.dt.cmdBindDescriptorSets(draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
-                                     prepare_views.layout, 0,
-                                     view_gen_descriptors.size(),
-                                     view_gen_descriptors.data(),
-                                     0, nullptr);
+        dev.dt.cmdBindDescriptorSets(
+            draw_cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
+            prepare_views.layout, 0,
+            view_gen_descriptors.size(),
+            view_gen_descriptors.data(),
+            0, nullptr
+        );
 
         shader::PrepareViewPushConstant view_push_const = {
             num_views, view_start, num_worlds, num_instances,
diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp
index fc2eda24..d190e842 100644
--- a/src/render/render_ctx.cpp
+++ b/src/render/render_ctx.cpp
@@ -77,8 +77,7 @@ void initCommonDrawPipelineInfo(
     VkPipelineRasterizationStateCreateInfo &raster_info) 
 {
     // Disable auto vertex assembly
-    vert_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+    vert_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
     vert_info.pNext = nullptr;
     vert_info.flags = 0;
     vert_info.vertexBindingDescriptionCount = 0;
@@ -87,30 +86,26 @@ void initCommonDrawPipelineInfo(
     vert_info.pVertexAttributeDescriptions = nullptr;
 
     // Assembly (standard tri indices)
-    input_assembly_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+    input_assembly_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
     input_assembly_info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
     input_assembly_info.primitiveRestartEnable = VK_FALSE;
 
     // Viewport (fully dynamic)
-    viewport_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
+    viewport_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
     viewport_info.viewportCount = 1;
     viewport_info.pViewports = nullptr;
     viewport_info.scissorCount = 1;
     viewport_info.pScissors = nullptr;
 
     // Multisample
-    multisample_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
+    multisample_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
     multisample_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
     multisample_info.sampleShadingEnable = VK_FALSE;
     multisample_info.alphaToCoverageEnable = VK_FALSE;
     multisample_info.alphaToOneEnable = VK_FALSE;
 
     // Rasterization
-    raster_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
+    raster_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
     raster_info.depthClampEnable = VK_FALSE;
     raster_info.rasterizerDiscardEnable = VK_FALSE;
     raster_info.polygonMode = VK_POLYGON_MODE_FILL;
@@ -285,12 +280,12 @@ static PipelineShaders makeDrawShaders(
 
     ShaderCompiler compiler;
     SPIRVShader vert_spirv = compiler.compileHLSLFileToSPV(
-        shader_path.c_str(), {}, {},
-        { "vert", ShaderStage::Vertex });
+        shader_path.c_str(), {}, {}, { "vert", ShaderStage::Vertex }
+    );
 
     SPIRVShader frag_spirv = compiler.compileHLSLFileToSPV(
-        shader_path.c_str(), {}, {},
-        { "frag", ShaderStage::Fragment });
+        shader_path.c_str(), {}, {}, { "frag", ShaderStage::Fragment }
+    );
 
 #if 0
             {0, 2, repeat_sampler, 1, 0},
@@ -349,9 +344,7 @@ static Pipeline<1> makeDrawPipeline(const Device &dev,
                                     VkSampler clamp_sampler,
                                     uint32_t num_frames)
 {
-    auto shaders =
-        makeDrawShaders(dev, repeat_sampler, clamp_sampler);
-
+    auto shaders = makeDrawShaders(dev, repeat_sampler, clamp_sampler);
     VkPipelineVertexInputStateCreateInfo vert_info {};
     VkPipelineInputAssemblyStateCreateInfo input_assembly_info {};
     VkPipelineViewportStateCreateInfo viewport_info {};
@@ -363,8 +356,7 @@ static Pipeline<1> makeDrawPipeline(const Device &dev,
 
     // Depth/Stencil
     VkPipelineDepthStencilStateCreateInfo depth_info {};
-    depth_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
+    depth_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
     depth_info.depthTestEnable = VK_TRUE;
     depth_info.depthWriteEnable = VK_TRUE;
     depth_info.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL;
@@ -423,16 +415,13 @@ static Pipeline<1> makeDrawPipeline(const Device &dev,
     gfx_layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
     gfx_layout_info.pNext = nullptr;
     gfx_layout_info.flags = 0;
-    gfx_layout_info.setLayoutCount =
-        static_cast<uint32_t>(draw_desc_layouts.size());
+    gfx_layout_info.setLayoutCount = static_cast<uint32_t>(draw_desc_layouts.size());
     gfx_layout_info.pSetLayouts = draw_desc_layouts.data();
     gfx_layout_info.pushConstantRangeCount = 1;
     gfx_layout_info.pPushConstantRanges = &push_const;
 
     VkPipelineLayout draw_layout;
-    REQ_VK(dev.dt.createPipelineLayout(dev.hdl, &gfx_layout_info, nullptr,
-                                       &draw_layout));
-
+    REQ_VK(dev.dt.createPipelineLayout(dev.hdl, &gfx_layout_info, nullptr, &draw_layout));
     array<VkPipelineShaderStageCreateInfo, 2> gfx_stages {{
         {
             VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
@@ -755,12 +744,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             view_offsets_base = view_offsets_cpu->ptr;
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            view_offsets_gpu = alloc.makeDedicatedBuffer(
-                num_offsets_bytes, false, true);
-
-            view_offsets_cuda.emplace(dev, view_offsets_gpu->mem,
-                num_offsets_bytes);
-
+            view_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true);
+            view_offsets_cuda.emplace(dev, view_offsets_gpu->mem, num_offsets_bytes);
             view_offsets_hdl = view_offsets_gpu->buf.buffer;
             view_offsets_base = (char *)view_offsets_cuda->getDevicePointer();
 #endif
@@ -776,12 +761,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             light_offsets_base = light_offsets_cpu->ptr;
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            light_offsets_gpu = alloc.makeDedicatedBuffer(
-                num_offsets_bytes, false, true);
-
-            light_offsets_cuda.emplace(dev, light_offsets_gpu->mem,
-                num_offsets_bytes);
-
+            light_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true);
+            light_offsets_cuda.emplace(dev, light_offsets_gpu->mem, num_offsets_bytes);
             light_offsets_hdl = light_offsets_gpu->buf.buffer;
             light_offsets_base = (char *)light_offsets_cuda->getDevicePointer();
 #endif
@@ -807,15 +788,10 @@ static EngineInterop setupEngineInterop(Device &dev,
         voxel_buffer_hdl = voxel_cpu->buffer;
     } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-        voxel_gpu = alloc.makeDedicatedBuffer(
-            staging_size, false, true);
-
-        voxel_cuda.emplace(
-            dev, voxel_gpu->mem, staging_size);
-
+        voxel_gpu = alloc.makeDedicatedBuffer(staging_size, false, true);
+        voxel_cuda.emplace(dev, voxel_gpu->mem, staging_size);
         voxel_buffer_hdl = voxel_gpu->buf.buffer;
-        voxel_buffer_ptr = num_voxels ?
-            (uint32_t *)voxel_cuda->getDevicePointer() : nullptr;
+        voxel_buffer_ptr = num_voxels ? (uint32_t *)voxel_cuda->getDevicePointer() : nullptr;
 #endif
     }
 
@@ -842,8 +818,7 @@ static EngineInterop setupEngineInterop(Device &dev,
         total_num_lights_cpu_inc->store_release(0);
     } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-        total_num_views_readback = (uint32_t *)cu::allocReadback(
-            3*sizeof(uint32_t));
+        total_num_views_readback = (uint32_t *)cu::allocReadback(3*sizeof(uint32_t));
         total_num_instances_readback = total_num_views_readback + 1;
         total_num_lights_readback = total_num_instances_readback + 1;
 #endif
@@ -1294,56 +1269,51 @@ static Sky loadSky(const vk::Device &dev, MemoryAllocator &alloc, VkQueue queue)
 }
 
 RenderContext::RenderContext(
-        APIBackend *render_backend,
-        GPUDevice *render_dev,
-        const RenderManager::Config &cfg)
-    : backend(*static_cast<vk::Backend *>(render_backend)),
-      dev(static_cast<vk::Device &>(*render_dev)),
-      alloc(dev, backend),
-      renderQueue(makeGFXQueue(dev, 0)),
-      br_width_(cfg.agentViewWidth),
-      br_height_(cfg.agentViewHeight),
-      pipelineCache(getPipelineCache(dev)),
-      repeatSampler(
-          makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_REPEAT)),
-      clampSampler(
-          makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE)),
-      renderPass(makeRenderPass(
-          dev, VK_FORMAT_R8G8B8A8_UNORM, InternalConfig::gbufferFormat,
-          InternalConfig::gbufferFormat, InternalConfig::depthFormat)),
-      shadowPass(makeShadowRenderPass(
-          dev, InternalConfig::varianceFormat, InternalConfig::depthFormat)),
-      instanceCull(makeCullPipeline(
-          dev, pipelineCache, InternalConfig::numFrames)),
-      objectDraw(makeDrawPipeline(dev, pipelineCache,
-          renderPass, repeatSampler, clampSampler,
-          InternalConfig::numFrames)),
-      asset_desc_pool_cull_(dev, instanceCull.shaders, 1, 1),
-      asset_desc_pool_draw_(dev, objectDraw.shaders, 1, 1),
-      asset_desc_pool_mat_tx_(dev, objectDraw.shaders, 2, 1),
-      asset_set_cull_(asset_desc_pool_cull_.makeSet()),
-      asset_set_draw_(asset_desc_pool_draw_.makeSet()),
-      asset_set_mat_tex_(asset_desc_pool_mat_tx_.makeSet()),
-      load_cmd_pool_(makeCmdPool(dev, dev.gfxQF)),
-      load_cmd_(makeCmdBuffer(dev, load_cmd_pool_)),
-      load_fence_(makeFence(dev)),
-      engine_interop_(setupEngineInterop(
-          dev, alloc, cfg.execMode == ExecMode::CUDA, cfg.numWorlds,
-          cfg.maxViewsPerWorld, cfg.maxInstancesPerWorld,
-          cfg.maxLightsPerWorld,
-          br_width_, br_height_, cfg.voxelCfg)),
-      lights_(InternalConfig::maxLights),
-      loaded_assets_(0),
-      sky_(loadSky(dev, alloc, renderQueue)),
-      material_textures_(0),
-      voxel_config_(cfg.voxelCfg),
-      num_worlds_(cfg.numWorlds),
-      gpu_input_(cfg.execMode == ExecMode::CUDA)
+    APIBackend *render_backend,
+    GPUDevice *render_dev,
+    const RenderManager::Config &cfg
+):  backend(*static_cast<vk::Backend *>(render_backend)),
+    dev(static_cast<vk::Device &>(*render_dev)),
+    alloc(dev, backend),
+    renderQueue(makeGFXQueue(dev, 0)),
+    br_width_(cfg.agentViewWidth),
+    br_height_(cfg.agentViewHeight),
+    pipelineCache(getPipelineCache(dev)),
+    repeatSampler(makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_REPEAT)),
+    clampSampler(makeImmutableSampler(dev, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE)),
+    renderPass(makeRenderPass(
+        dev, VK_FORMAT_R8G8B8A8_UNORM, InternalConfig::gbufferFormat,
+        InternalConfig::gbufferFormat, InternalConfig::depthFormat)),
+    shadowPass(makeShadowRenderPass(
+        dev, InternalConfig::varianceFormat, InternalConfig::depthFormat)),
+    instanceCull(makeCullPipeline(dev, pipelineCache, InternalConfig::numFrames)),
+    objectDraw(makeDrawPipeline(dev, pipelineCache, renderPass, repeatSampler, clampSampler, InternalConfig::numFrames)),
+    asset_desc_pool_cull_(dev, instanceCull.shaders, 1, 1),
+    asset_desc_pool_draw_(dev, objectDraw.shaders, 1, 1),
+    asset_desc_pool_mat_tx_(dev, objectDraw.shaders, 2, 1),
+    asset_set_cull_(asset_desc_pool_cull_.makeSet()),
+    asset_set_draw_(asset_desc_pool_draw_.makeSet()),
+    asset_set_mat_tex_(asset_desc_pool_mat_tx_.makeSet()),
+    load_cmd_pool_(makeCmdPool(dev, dev.gfxQF)),
+    load_cmd_(makeCmdBuffer(dev, load_cmd_pool_)),
+    load_fence_(makeFence(dev)),
+    engine_interop_(setupEngineInterop(
+        dev, alloc, cfg.execMode == ExecMode::CUDA, cfg.numWorlds,
+        cfg.maxViewsPerWorld, cfg.maxInstancesPerWorld,
+        cfg.maxLightsPerWorld,
+        br_width_, br_height_, cfg.voxelCfg)),
+    lights_(InternalConfig::maxLights),
+    loaded_assets_(0),
+    sky_(loadSky(dev, alloc, renderQueue)),
+    material_textures_(0),
+    voxel_config_(cfg.voxelCfg),
+    num_worlds_(cfg.numWorlds),
+    gpu_input_(cfg.execMode == ExecMode::CUDA)
 {
     {
         VkDescriptorPoolSize pool_sizes[] = {
             { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 25 },
-            { VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, InternalConfig::maxTextures*2 },
+            { VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, InternalConfig::maxTextures * 2 },
             { VK_DESCRIPTOR_TYPE_SAMPLER, 1 }
         };
 
@@ -1353,8 +1323,7 @@ RenderContext::RenderContext(
         pool_info.maxSets = 10 + InternalConfig::maxTextures + 1;
         pool_info.poolSizeCount = 3;
         pool_info.pPoolSizes = pool_sizes;
-        REQ_VK(dev.dt.createDescriptorPool(dev.hdl,
-            &pool_info, nullptr, &asset_pool_));
+        REQ_VK(dev.dt.createDescriptorPool(dev.hdl, &pool_info, nullptr, &asset_pool_));
     }
 
     {
@@ -1488,16 +1457,22 @@ RenderContext::RenderContext(
                 .binding = 0,
                 .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
                 .descriptorCount = InternalConfig::maxTextures,
-                .stageFlags =  VK_SHADER_STAGE_FRAGMENT_BIT,
+                .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
                 .pImmutableSamplers = nullptr,
             },
-
             {
                 .binding = 1,
                 .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER,
                 .descriptorCount = 1,
-                .stageFlags =  VK_SHADER_STAGE_FRAGMENT_BIT,
+                .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
                 .pImmutableSamplers = &repeatSampler
+            },
+            {
+                .binding = 2,
+                .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                .descriptorCount = 1,
+                .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+                .pImmutableSamplers = nullptr,
             }
         };
 
@@ -1505,7 +1480,7 @@ RenderContext::RenderContext(
             .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
             .pNext = &flag_info,
             .flags = 0,
-            .bindingCount = 2,
+            .bindingCount = 3,
             .pBindings = bindings
         };
 
@@ -1518,12 +1493,8 @@ RenderContext::RenderContext(
             .pNext = nullptr,
             .descriptorPool = asset_pool_,
             .descriptorSetCount = 1,
-            .pSetLayouts = &asset_tex_layout_
+            .pSetLayouts = &asset_batch_lighting_layout_
         };
-
-        dev.dt.allocateDescriptorSets(dev.hdl, &alloc_info, &asset_set_tex_compute_);
-
-        alloc_info.pSetLayouts = &asset_batch_lighting_layout_;
         dev.dt.allocateDescriptorSets(dev.hdl, &alloc_info, &asset_batch_lighting_set_);
     }
 
@@ -1564,16 +1535,16 @@ RenderContext::~RenderContext()
     dev.dt.destroyImageView(dev.hdl, sky_.mieView, nullptr);
     dev.dt.destroyImageView(dev.hdl, sky_.scatteringView, nullptr);
 
-    dev.dt.freeMemory(dev.hdl, sky_.transmittanceBacking, nullptr);
-    dev.dt.freeMemory(dev.hdl, sky_.irradianceBacking, nullptr);
-    dev.dt.freeMemory(dev.hdl, sky_.mieBacking, nullptr);
-    dev.dt.freeMemory(dev.hdl, sky_.scatteringBacking, nullptr);
-
     dev.dt.destroyImage(dev.hdl, sky_.transmittance.image, nullptr);
     dev.dt.destroyImage(dev.hdl, sky_.irradiance.image, nullptr);
     dev.dt.destroyImage(dev.hdl, sky_.singleMieScattering.image, nullptr);
     dev.dt.destroyImage(dev.hdl, sky_.scattering.image, nullptr);
 
+    dev.dt.freeMemory(dev.hdl, sky_.transmittanceBacking, nullptr);
+    dev.dt.freeMemory(dev.hdl, sky_.irradianceBacking, nullptr);
+    dev.dt.freeMemory(dev.hdl, sky_.mieBacking, nullptr);
+    dev.dt.freeMemory(dev.hdl, sky_.scatteringBacking, nullptr);
+
     dev.dt.destroyFence(dev.hdl, load_fence_, nullptr);
     dev.dt.destroyCommandPool(dev.hdl, load_cmd_pool_, nullptr);
 
@@ -1774,42 +1745,32 @@ static DynArray<MaterialTexture> loadTextures(
 
     dev.dt.beginCommandBuffer(cmdbuf, &begin_info);
 
-    for (const imp::SourceTexture &tx : textures)
-    {
+    for (const imp::SourceTexture &tx : textures) {
         if (tx.format == imp::SourceTextureFormat::BC7) {
             void *pixel_data = tx.data;
             uint32_t pixel_data_size = tx.numBytes;
-
-            uint32_t width = tx.width,
-                     height = tx.height;
-
-            auto [texture, texture_reqs] = alloc.makeTexture2D(
-                    width, height, 1, VK_FORMAT_BC7_UNORM_BLOCK);
+            uint32_t width = tx.width;
+            uint32_t height = tx.height;
+            auto [texture, texture_reqs] = alloc.makeTexture2D(width, height, 1, VK_FORMAT_BC7_UNORM_BLOCK);
 
             HostBuffer texture_hb_staging = alloc.makeStagingBuffer(texture_reqs.size);
             memcpy(texture_hb_staging.ptr, pixel_data, pixel_data_size);
             texture_hb_staging.flush(dev);
-
             std::optional<VkDeviceMemory> texture_backing = alloc.alloc(texture_reqs.size);
-
             assert(texture_backing.has_value());
-
             dev.dt.bindImageMemory(dev.hdl, texture.image, texture_backing.value(), 0);
 
             VkImageMemoryBarrier copy_prepare {
                 VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
-                    nullptr,
-                    0,
-                    VK_ACCESS_MEMORY_WRITE_BIT,
-                    VK_IMAGE_LAYOUT_UNDEFINED,
-                    VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                    VK_QUEUE_FAMILY_IGNORED,
-                    VK_QUEUE_FAMILY_IGNORED,
-                    texture.image,
-                    {
-                        VK_IMAGE_ASPECT_COLOR_BIT,
-                        0, 1, 0, 1
-                    },
+                nullptr,
+                0,
+                VK_ACCESS_MEMORY_WRITE_BIT,
+                VK_IMAGE_LAYOUT_UNDEFINED,
+                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                VK_QUEUE_FAMILY_IGNORED,
+                VK_QUEUE_FAMILY_IGNORED,
+                texture.image,
+                { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
             };
 
             dev.dt.cmdPipelineBarrier(cmdbuf,
@@ -1836,18 +1797,15 @@ static DynArray<MaterialTexture> loadTextures(
 
             VkImageMemoryBarrier finish_prepare {
                 VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
-                    nullptr,
-                    VK_ACCESS_MEMORY_WRITE_BIT,
-                    VK_ACCESS_SHADER_READ_BIT,
-                    VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                    VK_QUEUE_FAMILY_IGNORED,
-                    VK_QUEUE_FAMILY_IGNORED,
-                    texture.image,
-                    {
-                        VK_IMAGE_ASPECT_COLOR_BIT,
-                        0, 1, 0, 1
-                    },
+                nullptr,
+                VK_ACCESS_MEMORY_WRITE_BIT,
+                VK_ACCESS_SHADER_READ_BIT,
+                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                VK_QUEUE_FAMILY_IGNORED,
+                VK_QUEUE_FAMILY_IGNORED,
+                texture.image,
+                { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
             };
 
 
@@ -1881,23 +1839,16 @@ static DynArray<MaterialTexture> loadTextures(
             uint32_t height = tx.height;
 
             uint32_t mip_levels = std::max(1, (int32_t)std::floor(std::log2(std::max(width, height))));
-
-            auto [texture, texture_reqs] = alloc.makeTexture2D(
-                    width, height, mip_levels, VK_FORMAT_R8G8B8A8_SRGB);
+            auto [texture, texture_reqs] = alloc.makeTexture2D(width, height, mip_levels, VK_FORMAT_R8G8B8A8_SRGB);
 
             HostBuffer texture_hb_staging = alloc.makeStagingBuffer(texture_reqs.size);
             memcpy(texture_hb_staging.ptr, pixels, width * height * 4 * sizeof(char));
             texture_hb_staging.flush(dev);
-
             std::optional<VkDeviceMemory> texture_backing = alloc.alloc(texture_reqs.size);
-
             assert(texture_backing.has_value());
-
             dev.dt.bindImageMemory(dev.hdl, texture.image, texture_backing.value(), 0);
-
             copyBufferToImage(dev, cmdbuf, texture_hb_staging, texture.image, width, height, mip_levels);
             host_buffers.push_back(std::move(texture_hb_staging));
-
             generateMipmaps(dev, cmdbuf, texture.image, width, height, mip_levels);
 
             VkImageViewCreateInfo view_info {};
@@ -1914,8 +1865,6 @@ static DynArray<MaterialTexture> loadTextures(
             view_info.image = texture.image;
             view_info.format = VK_FORMAT_R8G8B8A8_SRGB;
             REQ_VK(dev.dt.createImageView(dev.hdl, &view_info, nullptr, &view));
-
-
             dst_textures.emplace_back(std::move(texture), view, texture_backing.value());
         }
     }
@@ -1957,6 +1906,9 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
     int64_t num_total_vertices = 0;
     int64_t num_total_indices = 0;
     int64_t num_total_meshes = 0;
+    int64_t num_total_objs = src_objs.size();
+    int64_t num_total_materials = src_mats.size();
+    int64_t num_total_textures = 0;
 
     for (const SourceObject &obj : src_objs) {
         num_total_meshes += obj.meshes.size();
@@ -1968,41 +1920,32 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
 
             num_total_vertices += mesh.numVertices;
             num_total_indices += mesh.numFaces * 3;
+            num_total_textures += src_mats[mesh.materialIDX].numTextures;
         }
     }
 
-    int64_t num_total_objs = src_objs.size();
 
-    int64_t buffer_offsets[5];
-    int64_t buffer_sizes[6] = {
+    int64_t buffer_offsets[6];
+    int64_t buffer_sizes[7] = {
         (int64_t)sizeof(ObjectData) * num_total_objs,
         (int64_t)sizeof(MeshData) * num_total_meshes,
         (int64_t)sizeof(PackedVertex) * num_total_vertices,
         (int64_t)sizeof(uint32_t) * num_total_indices,
-        (int64_t)sizeof(MaterialDataShader) * src_mats.size(),
-        (int64_t)sizeof(ShaderAABB) * num_total_objs
+        (int64_t)sizeof(MaterialDataShader) * num_total_materials,
+        (int64_t)sizeof(ShaderAABB) * num_total_objs,
+        (int64_t)sizeof(uint32_t) * num_total_textures
     };
 
-    int64_t num_asset_bytes = utils::computeBufferOffsets(
-        buffer_sizes, buffer_offsets, 256);
-
+    int64_t num_asset_bytes = utils::computeBufferOffsets(buffer_sizes, buffer_offsets, 256);
     HostBuffer staging = alloc.makeStagingBuffer(num_asset_bytes);
     char *staging_ptr = (char *)staging.ptr;
     ObjectData *obj_ptr = (ObjectData *)staging_ptr;
-    MeshData *mesh_ptr = 
-        (MeshData *)(staging_ptr + buffer_offsets[0]);
-    PackedVertex *vertex_ptr =
-        (PackedVertex *)(staging_ptr + buffer_offsets[1]);
-    uint32_t *indices_ptr =
-        (uint32_t *)(staging_ptr + buffer_offsets[2]);
-    MaterialDataShader *materials_ptr =
-        (MaterialDataShader *)(staging_ptr + buffer_offsets[3]);
-    ShaderAABB *aabbs_ptr =
-        (ShaderAABB *)(staging_ptr + buffer_offsets[4]);
-
-    int32_t mesh_offset = 0;
-    int32_t vertex_offset = 0;
-    int32_t index_offset = 0;
+    MeshData *mesh_ptr = (MeshData *)(staging_ptr + buffer_offsets[0]);
+    PackedVertex *vertex_ptr = (PackedVertex *)(staging_ptr + buffer_offsets[1]);
+    uint32_t *indices_ptr = (uint32_t *)(staging_ptr + buffer_offsets[2]);
+    MaterialDataShader *materials_ptr = (MaterialDataShader *)(staging_ptr + buffer_offsets[3]);
+    ShaderAABB *aabbs_ptr = (ShaderAABB *)(staging_ptr + buffer_offsets[4]);
+    uint32_t *mat_textures_ptr = (uint32_t *)(staging_ptr + buffer_offsets[5]);
 
     auto packHalf2x16 = [](const Vector2 &v) {
 #if defined(MADRONA_MSVC)
@@ -2020,37 +1963,35 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
         y_half = v.y;
 #endif
 
-        return uint32_t(std::bit_cast<uint16_t>(y_half)) << 16 |
-            uint32_t(std::bit_cast<uint16_t>(x_half));
+        return uint32_t(std::bit_cast<uint16_t>(y_half)) << 16 | uint32_t(std::bit_cast<uint16_t>(x_half));
     };
 
+    int32_t obj_offset = 0;
+    int32_t mesh_offset = 0;
+    int32_t vertex_offset = 0;
+    int32_t index_offset = 0;
     for (const SourceObject &obj : src_objs) {
-        *obj_ptr++ = ObjectData {
-            .meshOffset = mesh_offset,
-            .numMeshes = (int32_t)obj.meshes.size(),
-        };
+        ObjectData *obj_data = obj_ptr + (obj_offset++);
+        obj_data->meshOffset = mesh_offset;
+        obj_data->numMeshes = (int32_t)obj.meshes.size();
 
         for (const SourceMesh &mesh : obj.meshes) {
             uint32_t material_idx = mesh.materialIDX;
-
             int32_t num_mesh_verts = (int32_t)mesh.numVertices;
             int32_t num_mesh_indices = (int32_t)mesh.numFaces * 3;
 
-            MeshData mesh_data = MeshData {};
-            mesh_data.vertexOffset = vertex_offset;
-            mesh_data.numVertices = num_mesh_verts;
-            mesh_data.indexOffset = index_offset;
-            mesh_data.numIndices = num_mesh_indices;
-            mesh_data.materialIndex = (int32_t)material_idx;
-
-            mesh_ptr[mesh_offset++] = mesh_data;
+            MeshData *mesh_data = mesh_ptr + (mesh_offset++);
+            mesh_data->vertexOffset = vertex_offset;
+            mesh_data->numVertices = num_mesh_verts;
+            mesh_data->indexOffset = index_offset;
+            mesh_data->numIndices = num_mesh_indices;
+            mesh_data->materialIndex = (int32_t)material_idx;
 
             // Compute new normals
             HeapArray<Vector3> new_normals(num_mesh_verts);
             memset(new_normals.data(), 0, num_mesh_verts * sizeof(Vector3));
 
-            for (CountT face_idx = 0; face_idx < (CountT)mesh.numFaces;
-                    face_idx++) {
+            for (CountT face_idx = 0; face_idx < (CountT)mesh.numFaces; face_idx++) {
                 CountT base_idx = face_idx * 3;
                 uint32_t i0 = mesh.indices[base_idx];
                 uint32_t i1 = mesh.indices[base_idx + 1];
@@ -2062,7 +2003,6 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
 
                 Vector3 e0 = v1 - v0;
                 Vector3 e1 = v2 - v0;
-
                 Vector3 face_normal = cross(e0, e1);
 
                 new_normals[i0] += face_normal;     // align with pyrender
@@ -2070,98 +2010,70 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
                 new_normals[i2] += face_normal;
             }
 
-            for (int64_t vert_idx = 0; vert_idx < num_mesh_verts;
-                    vert_idx++) {
-                if (new_normals[vert_idx].length() == 0.f) {
-                    new_normals[vert_idx] = math::up;
+            for (CountT vert_idx = 0; vert_idx < num_mesh_verts; vert_idx++) {
+                Vector3 pos = mesh.positions[vert_idx];
+
+                // set normals
+                Vector3 normal = new_normals[vert_idx];
+                if (normal.length() == 0.f) {
+                    normal = math::up;
                 } else {
-                    new_normals[vert_idx] = normalize(new_normals[vert_idx]);
+                    normal = normalize(normal);
                 }
-            }
-
-            for (int32_t i = 0; i < num_mesh_verts; i++) {
-                Vector3 pos = mesh.positions[i];
-                Vector3 normal = new_normals[i];
-                if(mesh.normals) {
-                    mesh.normals[i] = normal;
+                new_normals[vert_idx] = normal;
+                if (mesh.normals) {
+                    mesh.normals[vert_idx] = normal;
                 }
 
+                // set tangent and sign
                 Vector3 a, b;
                 normal.frame(&a, &b);
-                Vector4 tangent_sign = {
-                    a.x,
-                    a.y,
-                    a.z,
-                    1.f,
-                };
-                if(mesh.tangentAndSigns) {
-                    mesh.tangentAndSigns[i] = tangent_sign;
+                Vector4 tangent_sign = { a.x, a.y, a.z, 1.f };
+                if (mesh.tangentAndSigns) {
+                    mesh.tangentAndSigns[vert_idx] = tangent_sign;
                 }
 
-                Vector2 uv = mesh.uvs ? mesh.uvs[i] : Vector2 { 0, 0 };
-
-                Vector3 encoded_normal_tangent =
-                    encodeNormalTangent(normal, tangent_sign);
+                Vector2 uv = mesh.uvs ? mesh.uvs[vert_idx] : Vector2 { 0, 0 };
 
                 // Encode UVs into a uint32 (how bad will this look on small images??
                 // Let's see I guess.
-                float encoded_uvs = std::bit_cast<float>(packHalf2x16(uv));
-
-                vertex_ptr[vertex_offset++] = PackedVertex {
-                    Vector4 {
-                        pos.x,
-                        pos.y,
-                        pos.z,
-                        encoded_normal_tangent.x,
-                    },
-                    Vector4 {
-                        encoded_normal_tangent.y,
-                        encoded_normal_tangent.z,
-                        encoded_uvs,
-                        0
-                    },
-                };
+                Vector3 enc_nt = encodeNormalTangent(normal, tangent_sign);
+                float enc_uvs = std::bit_cast<float>(packHalf2x16(uv));
+                PackedVertex *vertex_data = vertex_ptr + (vertex_offset++);
+                vertex_data->data[0] = Vector4 { pos.x, pos.y, pos.z, enc_nt.x };
+                vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uv, 0 };
             }
 
-            memcpy(indices_ptr + index_offset,
-                   mesh.indices, sizeof(uint32_t) * num_mesh_indices);
-
+            memcpy(indices_ptr + index_offset, mesh.indices, sizeof(uint32_t) * num_mesh_indices);
             index_offset += num_mesh_indices;
         }
     }
 
-    uint32_t mat_idx = 0;
+    int32_t mat_offset = 0;
+    int32_t mat_texture_offset = 0;
     for (const SourceMaterial &mat : src_mats) {
-        materials_ptr[mat_idx].color = mat.color;
-        materials_ptr[mat_idx].roughness = mat.roughness;
-        materials_ptr[mat_idx].metalness = mat.metalness;
-        materials_ptr[mat_idx++].textureIdx = mat.textureIdx;
+        int32_t num_mat_textures = (int32_t)mat.numTextures;
+        MaterialData *mat_data = materials_ptr + (mat_offset++);
+        mat_data->color = mat.color;
+        mat_data->roughness = mat.roughness;
+        mat_data->metalness = mat.metalness;
+        mat_data->textureOffset = mat_texture_offset;
+        mat_data->numTextures = num_mat_textures;
+        memcpy(mat_textures_ptr + mat_texture_offset, mat.textureIdx, sizeof(uint32_t) * num_mat_textures);
+        mat_texture_offset += num_mat_textures;
     }
 
+    int32_t aabb_offset = 0;
     math::AABB *aabbs_src = AssetProcessor::makeAABBs(src_objs);
-
-    ShaderAABB *shader_aabbs_src = (ShaderAABB *)malloc(
-            sizeof(ShaderAABB) * src_objs.size());
-
-    for (int i = 0; i < src_objs.size(); ++i) {
-        math::AABB *src = aabbs_src + i;
-        ShaderAABB *current = shader_aabbs_src + i;
-
-        current->data[0].x = src->pMin.x;
-        current->data[0].y = src->pMin.y;
-        current->data[0].z = src->pMin.z;
-
-        current->data[0].w = src->pMax.x;
-        current->data[1].x = src->pMax.y;
-        current->data[1].y = src->pMax.z;
+    for (int i = 0; i < num_total_objs; ++i) {
+        math::AABB *aabb = aabbs_src + i;
+        ShaderAABB *shader_aabb = aabbs_ptr + (aabb_offset++);
+        shader_aabb->data[0] = Vector4 { aabb->pMin.x, aabb->pMin.y, aabb->pMin.z, aabb->pMax.x };
+        shader_aabb->data[1] = Vector4 { aabb->pMax.y, aabb->pMax.z, 0.f, 0.f };
     }
-
-    memcpy(aabbs_ptr, shader_aabbs_src, sizeof(ShaderAABB) * src_objs.size());
-
     free(aabbs_src);
 
     staging.flush(dev);
-
     LocalBuffer asset_buffer = *alloc.makeLocalBuffer(num_asset_bytes);
     GPURunUtil gpu_run {
         load_cmd_pool_,
@@ -2178,9 +2090,7 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
         .size = (VkDeviceSize)num_asset_bytes,
     };
 
-    dev.dt.cmdCopyBuffer(load_cmd_, staging.buffer, asset_buffer.buffer,
-                         1, &buffer_copy);
-    
+    dev.dt.cmdCopyBuffer(load_cmd_, staging.buffer, asset_buffer.buffer, 1, &buffer_copy);
 
     gpu_run.submit(dev);
 
@@ -2248,7 +2158,7 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
     VkDescriptorBufferInfo index_set_info;
     index_set_info.buffer = asset_buffer.buffer;
     index_set_info.offset = buffer_offsets[2];
-    index_set_info.range = buffer_offsets[3] - buffer_offsets[2];
+    index_set_info.range = buffer_sizes[3];
 
     desc_updates.push_back({});
     DescHelper::storage(desc_updates[4], index_buffer_set, &index_set_info, 0);
@@ -2274,21 +2184,25 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
         material_textures_ = loadTextures(dev, alloc, renderQueue, textures);
     }
 
-    DynArray<VkDescriptorImageInfo> tx_infos(material_textures_.size());
-    for (auto &tx : material_textures_) {
-        tx_infos.push_back({
-                VK_NULL_HANDLE,
-                tx.view,
-                VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
-                });
-    }
-
     if (material_textures_.size()) {
+        DynArray<VkDescriptorImageInfo> tx_infos(material_textures_.size());
+        for (auto &tx : material_textures_) {
+            tx_infos.push_back({
+                .sampler = VK_NULL_HANDLE,
+                .imageView = tx.view,
+                .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
+            });
+        }
         desc_updates.push_back({});
         DescHelper::textures(desc_updates[9], asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0);
+        
+        VkDescriptorBufferInfo mat_tx_info;
+        mat_tx_info.buffer = asset_buffer.buffer;
+        mat_tx_info.offset = buffer_offsets[5];
+        mat_tx_info.range = buffer_sizes[6];
 
         desc_updates.push_back({});
-        DescHelper::textures(desc_updates[10], asset_set_tex_compute_, tx_infos.data(), tx_infos.size(), 0);
+        DescHelper::storage(desc_updates[10], asset_set_mat_tex_, &mat_tx_info, 2);
     }
 
     DescHelper::update(dev, desc_updates.data(), desc_updates.size());
diff --git a/src/render/render_ctx.hpp b/src/render/render_ctx.hpp
index 8ad81310..b745ad4c 100644
--- a/src/render/render_ctx.hpp
+++ b/src/render/render_ctx.hpp
@@ -76,7 +76,6 @@ struct RenderContext {
     VkDescriptorSetLayout asset_batch_draw_layout_;
 
     VkDescriptorPool asset_pool_;
-    VkDescriptorSet asset_set_tex_compute_;
     VkDescriptorSet asset_batch_lighting_set_;
 
     // This descriptor set contains information about the sky
diff --git a/src/render/shaders/batch_draw_depth.hlsl b/src/render/shaders/batch_draw_depth.hlsl
index 70b9279c..bdd210e1 100644
--- a/src/render/shaders/batch_draw_depth.hlsl
+++ b/src/render/shaders/batch_draw_depth.hlsl
@@ -47,11 +47,8 @@ float4 vert(in uint vid : SV_VertexID,
     Vertex vert = unpackVertex(vertexDataBuffer[vid]);
     uint instance_id = draw_data.instanceID;
 
-    PerspectiveCameraData view_data =
-        unpackViewData(viewDataBuffer[draw_data.viewID]);
-
-    EngineInstanceData instance_data = unpackEngineInstanceData(
-        engineInstanceBuffer[instance_id]);
+    PerspectiveCameraData view_data = unpackViewData(viewDataBuffer[draw_data.viewID]);
+    EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]);
 
     float3 to_view_translation;
     float4 to_view_rotation;
@@ -96,13 +93,9 @@ struct PixelOutput {
 };
 
 [shader("pixel")]
-PixelOutput frag(in V2F v2f,
-          in uint prim_id : SV_PrimitiveID)
+PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID)
 {
     PixelOutput output;
-
-    output.depthOut = length(v2f.vsCoord) + 
-            min(0.0, abs(materialBuffer[0].color.x));
-
+    output.depthOut = length(v2f.vsCoord) + min(0.0, abs(materialBuffer[0].color.x));
     return output;
 }
diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl
index 540753d5..377d60f5 100644
--- a/src/render/shaders/batch_draw_rgb.hlsl
+++ b/src/render/shaders/batch_draw_rgb.hlsl
@@ -49,6 +49,9 @@ Texture2D<float4> materialTexturesArray[];
 [[vk::binding(1, 3)]]
 SamplerState linearSampler;
 
+[[vk::binding(2, 3)]]
+StructuredBuffer<int> materialTexturesIndices;
+
 [[vk::binding(0, 4)]]
 Texture2D<float2> shadowMapTextures[];
 
@@ -61,11 +64,10 @@ struct V2F {
     [[vk::location(1)]] float3 worldPos : TEXCOORD0;
     [[vk::location(2)]] float2 uv : TEXCOORD1;
     [[vk::location(3)]] int materialIdx : TEXCOORD2;
-    [[vk::location(4)]] uint color : TEXCOORD3;
-    [[vk::location(5)]] float3 worldNormal : TEXCOORD4;
-    [[vk::location(6)]] uint worldIdx : TEXCOORD5;
-    [[vk::location(7)]] uint viewIdx : TEXCOORD6;
-    [[vk::location(8)]] uint objectIdx : TEXCOORD7;
+    [[vk::location(4)]] float3 worldNormal : TEXCOORD3;
+    [[vk::location(5)]] uint worldIdx : TEXCOORD4;
+    [[vk::location(6)]] uint viewIdx : TEXCOORD5;
+    [[vk::location(7)]] uint objectIdx : TEXCOORD6;
 };
 
 
@@ -78,22 +80,18 @@ void vert(in uint vid : SV_VertexID,
 
     Vertex vert = unpackVertex(vertexDataBuffer[vid]);
     uint instance_id = draw_data.instanceID;
-
-    PerspectiveCameraData view_data =
-        unpackViewData(viewDataBuffer[draw_data.viewID]);
-
-    EngineInstanceData instance_data = unpackEngineInstanceData(
-        engineInstanceBuffer[instance_id]);
+    PerspectiveCameraData view_data = unpackViewData(viewDataBuffer[draw_data.viewID]);
+    EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]);
 
     float3 to_view_translation;
     float4 to_view_rotation;
-    computeCompositeTransform(instance_data.position, instance_data.rotation,
+    computeCompositeTransform(
+        instance_data.position, instance_data.rotation,
         view_data.pos, view_data.rot,
-        to_view_translation, to_view_rotation);
+        to_view_translation, to_view_rotation
+    );
 
-    float3 view_pos =
-        rotateVec(to_view_rotation, instance_data.scale * vert.position) +
-            to_view_translation;
+    float3 view_pos = rotateVec(to_view_rotation, instance_data.scale * vert.position) + to_view_translation;
     float z_far = view_data.zFar;
     float z_near = view_data.zNear;
     float4 clip_pos = float4(
@@ -115,25 +113,21 @@ void vert(in uint vid : SV_VertexID,
 
     v2f.worldPos = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position;
     v2f.position = clip_pos;
-    v2f.uv = vert.uv;
+    v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y);
     v2f.worldNormal = rotateVec(instance_data.rotation, vert.normal);
     v2f.worldIdx = instance_data.worldID;
     v2f.viewIdx = draw_data.viewID;
     v2f.objectIdx = instance_data.objectID;
 
-    if (instance_data.matID == -2) {
-        v2f.materialIdx = -2;
-        v2f.color = instance_data.color;
-    } else if (instance_data.matID == -1) {
+    if (instance_data.matID == -1) {
         v2f.materialIdx = meshDataBuffer[draw_data.meshID].materialIndex;
-        v2f.color = 0;
     } else {
         v2f.materialIdx = instance_data.matID;
-        v2f.color = 0;
     }
 }
 
-float calculateLightAttenuating(ShaderLightData light, float3 worldPos) {
+float calculateLightAttenuating(ShaderLightData light, float3 worldPos)
+{
     if (light.isDirectional) { // Directional light
         return 1.0f;
     } else { // Spot light
@@ -142,7 +136,8 @@ float calculateLightAttenuating(ShaderLightData light, float3 worldPos) {
     }
 }
 
-float3 calculateRayDirection(ShaderLightData light, float3 worldPos) {
+float3 calculateRayDirection(ShaderLightData light, float3 worldPos)
+{
     if (light.isDirectional) { // Directional light
         return normalize(light.direction.xyz);
     } else { // Spot light
@@ -157,7 +152,8 @@ float3 calculateRayDirection(ShaderLightData light, float3 worldPos) {
     }
 }
 
-float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim) {
+float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim)
+{
     uint num_views_per_image = pushConst.maxShadowMapsXPerTarget * 
                                pushConst.maxShadowMapsYPerTarget;
 
@@ -174,7 +170,8 @@ float4 getShadowMapPixelScaleOffset(uint view_idx, uint2 shadow_map_dim) {
     return float4(scale, offset);
 }
 
-float linear_step(float low, float high, float v) {
+float linear_step(float low, float high, float v)
+{
     return clamp((v - low) / (high - low), 0, 1);
 }
 
@@ -197,8 +194,7 @@ float samplePCF(uint shadow_map_target_idx, float2 uv, float z)
 float4 calculuateLightSpacePosition(float3 world_pos, uint view_idx)
 {
     float4 world_pos_v4 = float4(world_pos.xyz, 1.f);
-    float4 ls_pos = mul(shadowViewDataBuffer[view_idx].viewProjectionMatrix, 
-                        world_pos_v4);
+    float4 ls_pos = mul(shadowViewDataBuffer[view_idx].viewProjectionMatrix, world_pos_v4);
     ls_pos.xyz /= ls_pos.w;
 
     return ls_pos;
@@ -278,47 +274,47 @@ PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID)
         output.rgbOut = float4(0.0, 0.0, 0.0, 1.0);
     }
     else {
+        MaterialData mat_data = materialBuffer[v2f.materialIdx];
+        float4 color = mat_data.color;
+
+        int texture_idx = -1;
+        uint texture_count = mat_data.textureCount;
+        if (texture_count > 0) {
+            uint texture_start = mat_data.textureOffset;
+            texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count];
+        }
+        if (texture_idx != -1) {
+            color *= materialTexturesArray[texture_idx].Sample(linearSampler, v2f.uv);
+        }
+
+        float3 totalLighting = float3(0.f, 0.f, 0.f);
+        uint numLights = pushConst.numLights;
+        float shadowFactor = shadowFactorVSM(v2f.worldPos, v2f.viewIdx);
 
-        if (v2f.materialIdx == -2) {
-            output.rgbOut = hexToRgb(v2f.color);
-        } else {
-            MaterialData mat_data = materialBuffer[v2f.materialIdx];
-            float4 color = mat_data.color;
+        [unroll(1)]
+        for (uint i = 0; i < numLights; i++) {
+            ShaderLightData light = unpackLightData(lightDataBuffer[v2f.worldIdx * numLights + i]);
+            if (!light.active) {
+                continue;
+            }
             
-            if (mat_data.textureIdx != -1) {
-                color *= materialTexturesArray[mat_data.textureIdx].Sample(
-                        linearSampler, v2f.uv);
+            float3 ray_dir = calculateRayDirection(light, v2f.worldPos);
+            if (all(ray_dir == float3(0, 0, 0))) {
+                continue;
             }
 
-            float3 totalLighting = float3(0.f, 0.f, 0.f);
-            uint numLights = pushConst.numLights;
-            float shadowFactor = shadowFactorVSM(v2f.worldPos, v2f.viewIdx);
-
-            [unroll(1)]
-            for (uint i = 0; i < numLights; i++) {
-                ShaderLightData light = unpackLightData(lightDataBuffer[v2f.worldIdx * numLights + i]);
-                if (!light.active) {
-                    continue;
-                }
-                
-                float3 ray_dir = calculateRayDirection(light, v2f.worldPos);
-                if (all(ray_dir == float3(0, 0, 0))) {
-                    continue;
-                }
-
-                float n_dot_l = max(0.0, dot(normal, -ray_dir));
-                float attenuating_factor = calculateLightAttenuating(light, v2f.worldPos);
-                totalLighting += attenuating_factor * hexToRgb(light.color).rgb * n_dot_l * light.intensity;
-
-                // Apply shadow to the shadowed light. Only support one shadow per view for now. 
-                if (i == shadowViewDataBuffer[v2f.viewIdx].lightIdx) {
-                    totalLighting *= shadowFactor;
-                }
+            float n_dot_l = max(0.0, dot(normal, -ray_dir));
+            float attenuating_factor = calculateLightAttenuating(light, v2f.worldPos);
+            totalLighting += attenuating_factor * hexToRgb(light.color).rgb * n_dot_l * light.intensity;
+
+            // Apply shadow to the shadowed light. Only support one shadow per view for now. 
+            if (i == shadowViewDataBuffer[v2f.viewIdx].lightIdx) {
+                totalLighting *= shadowFactor;
             }
-            
-            color.rgb = (totalLighting + ambient) * color.rgb;
-            output.rgbOut = color;
         }
+        
+        color.rgb = (totalLighting + ambient) * color.rgb;
+        output.rgbOut = color;
     }
 
     if (renderOptions.outputs[2]) {
diff --git a/src/render/shaders/draw_gbuffer.hlsl b/src/render/shaders/draw_gbuffer.hlsl
index 3181218d..22224574 100644
--- a/src/render/shaders/draw_gbuffer.hlsl
+++ b/src/render/shaders/draw_gbuffer.hlsl
@@ -29,6 +29,9 @@ Texture2D<float4> materialTexturesArray[];
 [[vk::binding(1, 2)]]
 SamplerState linearSampler;
 
+[[vk::binding(2, 2)]]
+StructuredBuffer<int> materialTexturesIndices;
+
 struct V2F {
     [[vk::location(0)]] float3 normal : TEXCOORD0;
     [[vk::location(1)]] float3 position : TEXCOORD1;
@@ -51,11 +54,8 @@ float4 vert(in uint vid : SV_VertexID,
     float4 color = materialBuffer[vert.materialIdx].color;
     uint instance_id = draw_data.instanceID;
 
-    PerspectiveCameraData view_data =
-        unpackViewData(viewDataBuffer[push_const.viewIdx]);
-
-    EngineInstanceData instance_data = unpackEngineInstanceData(
-        engineInstanceBuffer[instance_id]);
+    PerspectiveCameraData view_data =unpackViewData(viewDataBuffer[push_const.viewIdx]);
+    EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]);
 
     float3 to_view_translation;
     float4 to_view_rotation;
diff --git a/src/render/shaders/shader_common.h b/src/render/shaders/shader_common.h
index 178e89e6..17136283 100644
--- a/src/render/shaders/shader_common.h
+++ b/src/render/shaders/shader_common.h
@@ -163,13 +163,11 @@ struct MeshData {
 struct MaterialData {
     // For now, just a color
     float4 color;
-
-    int32_t textureIdx;
+    int32_t textureOffset;
+    int32_t numTextures;
 
     float roughness;
     float metalness;
-
-    int32_t pad[1];
 };
 
 struct ObjectData {
@@ -192,7 +190,6 @@ struct EngineInstanceData {
     int32_t matID;
     int32_t objectID;
     int32_t worldID;
-    uint32_t color;
 };
 
 struct PackedViewData {
@@ -275,7 +272,6 @@ struct DrawCmd {
 struct DrawData {
     int instanceID;
     int materialID;
-    uint32_t color;
 };
 
 struct RenderOptions {
diff --git a/src/render/shaders/shader_utils.hlsl b/src/render/shaders/shader_utils.hlsl
index 58183922..5d8b3715 100644
--- a/src/render/shaders/shader_utils.hlsl
+++ b/src/render/shaders/shader_utils.hlsl
@@ -75,7 +75,6 @@ EngineInstanceData unpackEngineInstanceData(PackedInstanceData packed)
     o.matID = asint(d2.z);
     o.objectID = asint(d2.w);
     o.worldID = asint(d3.x);
-    o.color = asuint(d3.y);
 
     return o;
 }
diff --git a/src/render/shaders/viewer_cull.hlsl b/src/render/shaders/viewer_cull.hlsl
index 60b98a3f..e85be5b8 100644
--- a/src/render/shaders/viewer_cull.hlsl
+++ b/src/render/shaders/viewer_cull.hlsl
@@ -79,9 +79,7 @@ void instanceCull(uint3 tid           : SV_DispatchThreadID,
 
         uint current_instance_idx = sm.instancesOffset + local_idx;
 
-        EngineInstanceData instance_data = unpackEngineInstanceData(
-            engineInstanceBuffer[current_instance_idx]);
-
+        EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[current_instance_idx]);
         ObjectData obj = objectDataBuffer[instance_data.objectID];
 
         uint draw_offset;
@@ -105,9 +103,7 @@ void instanceCull(uint3 tid           : SV_DispatchThreadID,
             } else {
                 draw_data.materialID = instance_data.matID;
             }
-
             draw_data.instanceID = current_instance_idx;
-            draw_data.color = instance_data.color;
 
             drawCommandBuffer[draw_id] = draw_cmd;
             drawDataBuffer[draw_id] = draw_data;
diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl
index 786e5508..9e250a1a 100644
--- a/src/render/shaders/viewer_draw.hlsl
+++ b/src/render/shaders/viewer_draw.hlsl
@@ -35,15 +35,15 @@ Texture2D<float4> materialTexturesArray[];
 [[vk::binding(1, 2)]]
 SamplerState linearSampler;
 
+[[vk::binding(2, 2)]]
+StructuredBuffer<int> materialTexturesIndices;
+
 struct V2F {
     [[vk::location(0)]] float3 normal : TEXCOORD0;
     [[vk::location(1)]] float3 position : TEXCOORD1;
-    [[vk::location(2)]] float4 color : TEXCOORD2;
     [[vk::location(3)]] float dummy : TEXCOORD3;
     [[vk::location(4)]] float2 uv : TEXCOORD4;
-    [[vk::location(5)]] int texIdx : TEXCOORD5;
-    [[vk::location(6)]] float roughness : TEXCOORD6;
-    [[vk::location(7)]] float metalness : TEXCOORD7;
+    [[vk::location(3)]] int materialIdx : TEXCOORD2;
 };
 
 PerspectiveCameraData getCameraData()
@@ -72,24 +72,21 @@ float4 vert(in uint vid : SV_VertexID,
             out V2F v2f) : SV_Position
 {
     DrawData draw_data = drawDataBuffer[draw_id];
-
     Vertex vert = unpackVertex(vertexDataBuffer[vid]);
 
     uint instance_id = draw_data.instanceID;
-    EngineInstanceData instance_data = unpackEngineInstanceData(
-        engineInstanceBuffer[instance_id]);
-
+    EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]);
     PerspectiveCameraData view_data = getCameraData();
 
     float3 to_view_translation;
     float4 to_view_rotation;
-    computeCompositeTransform(instance_data.position, instance_data.rotation,
+    computeCompositeTransform(
+        instance_data.position, instance_data.rotation,
         view_data.pos, view_data.rot,
-        to_view_translation, to_view_rotation);
+        to_view_translation, to_view_rotation
+    );
 
-    float3 view_pos =
-        rotateVec(to_view_rotation, instance_data.scale * vert.position) +
-            to_view_translation;
+    float3 view_pos = rotateVec(to_view_rotation, instance_data.scale * vert.position) + to_view_translation;
 
 #if 0
     float4 clip_pos = float4(
@@ -135,34 +132,11 @@ float4 vert(in uint vid : SV_VertexID,
             view_pos.y);
     }
 
-    v2f.normal = normalize(
-        rotateVec(instance_data.rotation, (vert.normal / instance_data.scale)));
-    v2f.uv = vert.uv;
-
-    v2f.position = rotateVec(instance_data.rotation,
-                             instance_data.scale * vert.position) + instance_data.position;
+    v2f.normal = normalize(rotateVec(instance_data.rotation, (vert.normal / instance_data.scale)));
+    v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y);
+    v2f.position = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position;
     v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0];
-
-
-
-    v2f.texIdx = -1;
-    // Defaults for now
-    v2f.roughness = 0.8;
-    v2f.metalness = 0.2;
-
-    if (draw_data.materialID == -2) {
-        v2f.color = hexToRgb(draw_data.color);
-    } else {
-        int32_t material_id = draw_data.materialID;
-        
-        float4 color = materialBuffer[material_id].color;
-
-        // Material
-        v2f.color = color;
-        v2f.texIdx = materialBuffer[material_id].textureIdx;
-        v2f.roughness = materialBuffer[material_id].roughness;
-        v2f.metalness = materialBuffer[material_id].metalness;
-    }
+    v2f.materialIdx = draw_data.materialID;
 
     return clip_pos;
 }
@@ -177,18 +151,25 @@ struct PixelOutput {
 PixelOutput frag(in V2F v2f)
 {
     PixelOutput output;
-    output.color = v2f.color;
-    output.color.a = v2f.roughness;
-    output.normal = float4(normalize(v2f.normal), 1.f);
-    output.position = float4(v2f.position, v2f.dummy * 0.0000001f);
-    output.position.a += v2f.metalness;
 
-    // output.color.rgb = v2f.normal.xyz;
+    float metalness = mat_data.metalness;
+    float roughness = mat_data.roughness;
+    MaterialData mat_data = materialBuffer[v2f.materialIdx];
+    float4 color = mat_data.color;
 
-    if (v2f.texIdx != -1) {
-        output.color *= materialTexturesArray[v2f.texIdx].SampleLevel(
-            linearSampler, float2(v2f.uv.x, 1.f - v2f.uv.y), 0);
+    int texture_idx = -1;
+    uint texture_count = mat_data.textureCount;
+    if (texture_count > 0) {
+        uint texture_start = mat_data.textureOffset;
+        texture_idx = materialTexturesIndices[texture_start + v2f.worldID % texture_count];
     }
-
+    if (texture_idx != -1) {
+        color *= materialTexturesArray[texture_idx].SampleLevel(linearSampler, v2f.uv, 0);
+    }
+    
+    output.color = color;
+    output.color.a = roughness;
+    output.normal = float4(normalize(v2f.normal), 1.f);
+    output.position = float4(v2f.position, v2f.dummy * 0.0000001f + metalness);
     return output;
 }
diff --git a/src/render/shaders/voxel_draw.hlsl b/src/render/shaders/voxel_draw.hlsl
index 76c96f4e..7b7b78bc 100644
--- a/src/render/shaders/voxel_draw.hlsl
+++ b/src/render/shaders/voxel_draw.hlsl
@@ -23,6 +23,9 @@ Texture2D<float4> materialTexturesArray[];
 [[vk::binding(1, 1)]]
 SamplerState linearSampler;
 
+[[vk::binding(2, 1)]]
+StructuredBuffer<int> materialTexturesIndices;
+
 struct V2F {
     [[vk::location(0)]] float3 normal : TEXCOORD0;
     [[vk::location(1)]] float3 position : TEXCOORD1;

From 4acad9b957151a3c8cd5069f37e7d1a5b0d1d879 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 00:50:33 -0500
Subject: [PATCH 02/11] fix(1)

---
 include/madrona/importer.hpp |   2 +-
 src/bridge/bindings.cpp      |  62 ++++++-------
 src/bridge/mgr.cpp           | 171 ++++++++++++++++++++---------------
 src/importer/gltf.cpp        |   4 +-
 src/render/render_ctx.cpp    |  57 ++++--------
 5 files changed, 147 insertions(+), 149 deletions(-)

diff --git a/include/madrona/importer.hpp b/include/madrona/importer.hpp
index 731035f4..9ff36dbd 100644
--- a/include/madrona/importer.hpp
+++ b/include/madrona/importer.hpp
@@ -105,7 +105,7 @@ struct ImportedAssets {
 
     DynArray<SourceObject> objects;
     DynArray<SourceMaterial> materials;
-    DynArray<uint32_t> materialTextures;
+    DynArray<int32_t> materialTextures;
     DynArray<SourceInstance> instances;
     DynArray<SourceTexture> textures;
 };
diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp
index e5948391..c928e429 100644
--- a/src/bridge/bindings.cpp
+++ b/src/bridge/bindings.cpp
@@ -150,39 +150,39 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
            nb::arg("visualizer_gpu_handles") = nb::none(),
            nb::keep_alive<1, 31>())
         .def("init", [](Manager &mgr,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_pos,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 4>> geom_rot,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> cam_pos,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 4>> cam_rot,
-                        nb::ndarray<nb::pytorch, const int32_t, nb::shape<-1, -1>> mat_ids,
-                        // nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> geom_rgb,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_sizes,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_pos,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_dir,
-                        nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> light_rgb,
-                        nb::ndarray<nb::pytorch, const bool, nb::shape<-1, -1>> light_isdir,
-                        nb::ndarray<nb::pytorch, const bool, nb::shape<-1, -1>> light_castshadow,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_cutoff,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_attenuation,
-                        nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_intensity)
-
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_pos,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 4>> geom_rot,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> cam_pos,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 4>> cam_rot,
+            nb::ndarray<nb::pytorch, const int32_t, nb::shape<-1, -1>> mat_ids,
+            nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> geom_rgb,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_sizes,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_pos,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> light_dir,
+            nb::ndarray<nb::pytorch, const uint32_t, nb::shape<-1, -1>> light_rgb,
+            nb::ndarray<nb::pytorch, const bool, nb::shape<-1, -1>> light_isdir,
+            nb::ndarray<nb::pytorch, const bool, nb::shape<-1, -1>> light_castshadow,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_cutoff,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_attenuation,
+            nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_intensity)
         {
             mgr.init(
-                     reinterpret_cast<const math::Vector3 *>(geom_pos.data()),
-                     reinterpret_cast<const math::Quat *>(geom_rot.data()),
-                     reinterpret_cast<const math::Vector3 *>(cam_pos.data()),
-                     reinterpret_cast<const math::Quat *>(cam_rot.data()),
-                     reinterpret_cast<const int32_t *>(mat_ids.data()),
-                    //  reinterpret_cast<const uint32_t *>(geom_rgb.data()),
-                     reinterpret_cast<const math::Diag3x3 *>(geom_sizes.data()),
-                     reinterpret_cast<const math::Vector3 *>(light_pos.data()),
-                     reinterpret_cast<const math::Vector3 *>(light_dir.data()),
-                     reinterpret_cast<const uint32_t *>(light_rgb.data()),
-                     reinterpret_cast<const bool *>(light_isdir.data()),
-                     reinterpret_cast<const bool *>(light_castshadow.data()),
-                     reinterpret_cast<const float *>(light_cutoff.data()),
-                     reinterpret_cast<const float *>(light_attenuation.data()),
-                     reinterpret_cast<const float *>(light_intensity.data()));
+                reinterpret_cast<const math::Vector3 *>(geom_pos.data()) if geom_pos.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Quat *>(geom_rot.data()) if geom_rot.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Vector3 *>(cam_pos.data()) if cam_pos.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Quat *>(cam_rot.data()) if cam_rot.shape(0) > 0 else nullptr,
+                reinterpret_cast<const int32_t *>(mat_ids.data()) if mat_ids.shape(0) > 0 else nullptr,
+                reinterpret_cast<const uint32_t *>(geom_rgb.data()) if geom_rgb.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Diag3x3 *>(geom_sizes.data()) if geom_sizes.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Vector3 *>(light_pos.data()) if light_pos.shape(0) > 0 else nullptr,
+                reinterpret_cast<const math::Vector3 *>(light_dir.data()) if light_dir.shape(0) > 0 else nullptr,
+                reinterpret_cast<const uint32_t *>(light_rgb.data()) if light_rgb.shape(0) > 0 else nullptr,
+                reinterpret_cast<const bool *>(light_isdir.data()) if light_isdir.shape(0) > 0 else nullptr,
+                reinterpret_cast<const bool *>(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr,
+                reinterpret_cast<const float *>(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr,
+                reinterpret_cast<const float *>(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr,
+                reinterpret_cast<const float *>(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr,
+            );
         })
         .def("render", [](Manager &mgr,
             nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1, 3>> geom_pos,
diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp
index 167bb256..cbc26392 100644
--- a/src/bridge/mgr.cpp
+++ b/src/bridge/mgr.cpp
@@ -154,26 +154,33 @@ struct Manager::Impl {
                                  Quat *cam_rotations,
                                  cudaStream_t strm)
     {
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::InstancePositions),
-            geom_positions,
-            sizeof(Vector3) * numGeoms * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::InstanceRotations),
-            geom_rotations,
-            sizeof(Quat) * numGeoms * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::CameraPositions),
-            cam_positions,
-            sizeof(Vector3) * numCams * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::CameraRotations),
-            cam_rotations,
-            sizeof(Quat) * numCams * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
+        uint32_t total_geoms = numGeoms * cfg.numWorlds;
+        if (geom_positions != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::InstancePositions),
+                geom_positions, sizeof(Vector3) * total_geoms, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (geom_rotations != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::InstanceRotations),
+                geom_rotations, sizeof(Quat) * total_geoms, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        
+        uint32_t total_cams = numCams * cfg.numWorlds;
+        if (cam_positions != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::CameraPositions),
+                cam_positions, sizeof(Vector3) * total_cams, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (cam_rotations != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::CameraRotations),
+                cam_rotations, sizeof(Quat) * total_cams, cudaMemcpyDeviceToDevice, strm
+            );
+        }
     }
 
     inline void copyInProperties(
@@ -190,63 +197,77 @@ struct Manager::Impl {
         float *light_intensity,
         cudaStream_t strm)
     {
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::InstanceMatOverrides),
-            mat_overrides,
-            sizeof(MaterialOverride) * numGeoms * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::InstanceColorOverrides),
-            col_overrides,
-            sizeof(ColorOverride) * numGeoms * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::InstanceScales),
-            geom_sizes,
-            sizeof(Diag3x3) * numGeoms * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
+        uint32_t total_geoms = numGeoms * cfg.numWorlds;
+        if (mat_overrides != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::InstanceMatOverrides),
+                mat_overrides, sizeof(MaterialOverride) * total_geoms, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        // TODO: Remove ColorOverride from ECS
+        if (col_overrides != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::InstanceColorOverrides),
+                col_overrides, sizeof(ColorOverride) * total_geoms, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (geom_sizes != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::InstanceScales),
+                geom_sizes, sizeof(Diag3x3) * total_geoms, cudaMemcpyDeviceToDevice, strm
+            );
+        }
 
         // Copy light properties to GPU
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightPositions),
-            light_pos,
-            sizeof(Vector3) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightDirections),
-            light_dir,
-            sizeof(Vector3) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightColors),
-            light_color,
-            sizeof(ColorOverride) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightTypes),
-            light_isdir,
-            sizeof(bool) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightShadows),
-            light_castshadow,
-            sizeof(bool) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightCutoffAngles),
-            light_cutoff,
-            sizeof(float) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightAttenuations),
-            light_attenuation,
-            sizeof(float) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
-        cudaMemcpyAsync(
-            gpuExec.getExported((CountT)ExportID::LightIntensities),
-            light_intensity,
-            sizeof(float) * numLights * cfg.numWorlds,
-            cudaMemcpyDeviceToDevice, strm);
+        uint32_t total_lights = numLights * cfg.numWorlds;
+        if (light_pos != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightPositions),
+                light_pos, sizeof(Vector3) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_dir != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightDirections),
+                light_dir, sizeof(Vector3) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_color != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightColors),
+                light_color, sizeof(ColorOverride) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_isdir != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightTypes),
+                light_isdir, sizeof(bool) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_castshadow != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightShadows),
+                light_castshadow, sizeof(bool) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_cutoff != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightCutoffAngles),
+                light_cutoff, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_attenuation != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightAttenuations),
+                light_attenuation, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
+        if (light_intensity != nullptr) {
+            cudaMemcpyAsync(
+                gpuExec.getExported((CountT)ExportID::LightIntensities),
+                light_intensity, sizeof(float) * total_lights, cudaMemcpyDeviceToDevice, strm
+            );
+        }
     }
 
     inline void init(const Vector3 *geom_positions,
diff --git a/src/importer/gltf.cpp b/src/importer/gltf.cpp
index 1532be5e..c7b656cf 100644
--- a/src/importer/gltf.cpp
+++ b/src/importer/gltf.cpp
@@ -1811,10 +1811,10 @@ static bool gltfImportAssets(LoaderData &loader,
         int32_t texture_id = material.baseColorIdx;
         if (texture_id != -1) {
             texture_id += prev_tex_idx;
-            imported.materialTextures.push_back(uint32_t(texture_id));
+            imported.materialTextures.push_back(texture_id);
             imported.materials.emplace_back(SourceMaterial {
                 .color = material.baseColor,
-                .textureIdx = &imported.materialTextures[imported.materialTextures.size() - 1],
+                .textureIdx = imported.materialTextures.data() + (imported.materialTextures.size() - 1),
                 .numTextures = 1,
                 .roughness = material.roughness,
                 .metalness = material.metallic,
diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp
index d190e842..21b7d435 100644
--- a/src/render/render_ctx.cpp
+++ b/src/render/render_ctx.cpp
@@ -55,7 +55,7 @@ using namespace vk;
 using Vertex = render::shader::Vertex;
 using PackedVertex = render::shader::PackedVertex;
 using MeshData = render::shader::MeshData;
-using MaterialDataShader = render::shader::MaterialData;
+using MaterialData = render::shader::MaterialData;
 using ObjectData = render::shader::ObjectData;
 using DrawPushConst = render::shader::DrawPushConst;
 using CullPushConst = render::shader::CullPushConst;
@@ -635,11 +635,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             world_ids_views_base = malloc(sizeof(uint64_t) * num_worlds * max_views_per_world);
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            views_gpu = alloc.makeDedicatedBuffer(
-                num_views_bytes, false, true);
-            views_cuda.emplace(dev, views_gpu->mem,
-                num_views_bytes);
-
+            views_gpu = alloc.makeDedicatedBuffer(num_views_bytes, false, true);
+            views_cuda.emplace(dev, views_gpu->mem, num_views_bytes);
             views_hdl = views_gpu->buf.buffer;
             views_base = (char *)views_cuda->getDevicePointer();
 #endif
@@ -657,12 +654,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             instances_base = malloc(sizeof(render::shader::PackedInstanceData) * num_worlds * max_instances_per_world);
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            instances_gpu = alloc.makeDedicatedBuffer(
-                num_instances_bytes, false, true);
-
-            instances_cuda.emplace(dev, instances_gpu->mem,
-                num_instances_bytes);
-
+            instances_gpu = alloc.makeDedicatedBuffer(num_instances_bytes, false, true);
+            instances_cuda.emplace(dev, instances_gpu->mem, num_instances_bytes);
             instances_hdl = instances_gpu->buf.buffer;
             instances_base = (char *)instances_cuda->getDevicePointer();
 #endif
@@ -678,15 +671,10 @@ static EngineInterop setupEngineInterop(Device &dev,
             lights_cpu = alloc.makeStagingBuffer(num_lights_bytes);
             lights_hdl = lights_cpu->buffer;
             lights_base = malloc(sizeof(render::shader::PackedLightData) * num_worlds * max_lights_per_world);
-        }
-        else
-        {
+        } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            lights_gpu = alloc.makeDedicatedBuffer(
-                num_lights_bytes, false, true);
-            lights_cuda.emplace(dev, lights_gpu->mem,
-                num_lights_bytes);
-
+            lights_gpu = alloc.makeDedicatedBuffer(num_lights_bytes, false, true);
+            lights_cuda.emplace(dev, lights_gpu->mem, num_lights_bytes);
             lights_hdl = lights_gpu->buf.buffer;
             lights_base = (char *)lights_cuda->getDevicePointer();
 #endif
@@ -703,11 +691,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             aabb_base = malloc(sizeof(render::shader::AABB) * num_worlds * max_instances_per_world);
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            aabb_gpu = alloc.makeDedicatedBuffer(
-                num_aabb_bytes, false, true);
-            aabb_cuda.emplace(dev, aabb_gpu->mem,
-                num_aabb_bytes);
-
+            aabb_gpu = alloc.makeDedicatedBuffer(num_aabb_bytes, false, true);
+            aabb_cuda.emplace(dev, aabb_gpu->mem, num_aabb_bytes);
             aabb_hdl = aabb_gpu->buf.buffer;
             aabb_base = (char *)aabb_cuda->getDevicePointer();
 #endif
@@ -723,12 +708,8 @@ static EngineInterop setupEngineInterop(Device &dev,
             instance_offsets_base = instance_offsets_cpu->ptr;
         } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-            instance_offsets_gpu = alloc.makeDedicatedBuffer(
-                num_offsets_bytes, false, true);
-
-            instance_offsets_cuda.emplace(dev, instance_offsets_gpu->mem,
-                num_offsets_bytes);
-
+            instance_offsets_gpu = alloc.makeDedicatedBuffer(num_offsets_bytes, false, true);
+            instance_offsets_cuda.emplace(dev, instance_offsets_gpu->mem, num_offsets_bytes);
             instance_offsets_hdl = instance_offsets_gpu->buf.buffer;
             instance_offsets_base = (char *)instance_offsets_cuda->getDevicePointer();
 #endif
@@ -769,8 +750,7 @@ static EngineInterop setupEngineInterop(Device &dev,
         }
     }
 
-    const uint32_t num_voxels = voxel_config.xLength
-        * voxel_config.yLength * voxel_config.zLength;
+    const uint32_t num_voxels = voxel_config.xLength * voxel_config.yLength * voxel_config.zLength;
     const uint32_t staging_size = num_voxels > 0 ? num_voxels * sizeof(int32_t) : 4;
 
     auto voxel_cpu = Optional<HostBuffer>::none();
@@ -804,8 +784,7 @@ static EngineInterop setupEngineInterop(Device &dev,
     AtomicU32 *total_num_lights_cpu_inc = nullptr;
 
     if (!gpu_input) {
-        total_num_views_readback = (uint32_t *)malloc(
-            3*sizeof(uint32_t));
+        total_num_views_readback = (uint32_t *)malloc(3*sizeof(uint32_t));
         total_num_instances_readback = total_num_views_readback + 1;
         total_num_lights_readback = total_num_instances_readback + 1;
 
@@ -855,10 +834,8 @@ static EngineInterop setupEngineInterop(Device &dev,
         gpu_bridge = nullptr;
     } else {
 #ifdef MADRONA_VK_CUDA_SUPPORT
-        gpu_bridge = (const RenderECSBridge *)cu::allocGPU(
-            sizeof(RenderECSBridge));
-        cudaMemcpy((void *)gpu_bridge, &bridge, sizeof(RenderECSBridge),
-                   cudaMemcpyHostToDevice);
+        gpu_bridge = (const RenderECSBridge *)cu::allocGPU(sizeof(RenderECSBridge));
+        cudaMemcpy((void *)gpu_bridge, &bridge, sizeof(RenderECSBridge), cudaMemcpyHostToDevice);
 #endif
     }
 
@@ -2041,7 +2018,7 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
                 float enc_uvs = std::bit_cast<float>(packHalf2x16(uv));
                 PackedVertex *vertex_data = vertex_ptr + (vertex_offset++);
                 vertex_data->data[0] = Vector4 { pos.x, pos.y, pos.z, enc_nt.x };
-                vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uv, 0 };
+                vertex_data->data[1] = Vector4 { enc_nt.y, enc_nt.z, enc_uvs, 0 };
             }
 
             memcpy(indices_ptr + index_offset, mesh.indices, sizeof(uint32_t) * num_mesh_indices);

From 8148376a0200795ba5a56ff78e6ce9926d843060 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 01:25:46 -0500
Subject: [PATCH 03/11] fix(2)

---
 src/bridge/bindings.cpp   |  1 -
 src/bridge/mgr.cpp        | 57 ++++++++++++++++++++-------------------
 src/bridge/mgr.hpp        |  1 -
 src/render/render_ctx.cpp |  4 +--
 4 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp
index c928e429..34870d35 100644
--- a/src/bridge/bindings.cpp
+++ b/src/bridge/bindings.cpp
@@ -114,7 +114,6 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 viz_gpu_hdls != nullptr ? *viz_gpu_hdls : Optional<VisualizerGPUHandles>::none()
             );
 
-            free(ptr_geom_rgba);
             free(ptr_geom_mat_ids);
             free(ptr_geom_data_ids);
         }, nb::arg("gpu_id"),
diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp
index cbc26392..7cfcca0a 100644
--- a/src/bridge/mgr.cpp
+++ b/src/bridge/mgr.cpp
@@ -167,7 +167,7 @@ struct Manager::Impl {
                 geom_rotations, sizeof(Quat) * total_geoms, cudaMemcpyDeviceToDevice, strm
             );
         }
-        
+
         uint32_t total_cams = numCams * cfg.numWorlds;
         if (cam_positions != nullptr) {
             cudaMemcpyAsync(
@@ -525,33 +525,33 @@ static RTAssets loadRenderObjects(
     }
 
     // Create materials for geoms that do not have one assigned
-    for (CountT i = 0; i < model.numGeoms; i++) {
-        if (model.geomMatIDs[i] == -1) {
-            const math::Vector4 &rgba_i = model.geomRGBA[i];
-            SourceMaterial mat = {
-                .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w},
-                .textureIdx = nullptr,
-                .numTextures = 0,
-                .roughness = 0.8f,
-                .metalness = 0.2f,
-            };
-            materials.push_back(mat);
-            model.geomMatIDs[i] = materials.size() - 1;
-
-            for (CountT j = i + 1; j < model.numGeoms; j++) {
-                // FIX: Should probably implement == op for Vector4
-                const math::Vector4 &rgba_j = model.geomRGBA[j];
-                if (model.geomMatIDs[j] == -1 && 
-                    rgba_i.x == rgba_j.x &&
-                    rgba_i.y == rgba_j.y &&
-                    rgba_i.z == rgba_j.z &&
-                    rgba_i.w == rgba_j.w
-                ) {
-                    model.geomMatIDs[j] = materials.size() - 1;
-                }
-            }
-        }
-    }
+    // for (CountT i = 0; i < model.numGeoms; i++) {
+    //     if (model.geomMatIDs[i] == -1) {
+    //         const math::Vector4 &rgba_i = model.geomRGBA[i];
+    //         SourceMaterial mat = {
+    //             .color = math::Vector4{rgba_i.x, rgba_i.y, rgba_i.z, rgba_i.w},
+    //             .textureIdx = nullptr,
+    //             .numTextures = 0,
+    //             .roughness = 0.8f,
+    //             .metalness = 0.2f,
+    //         };
+    //         materials.push_back(mat);
+    //         model.geomMatIDs[i] = materials.size() - 1;
+
+    //         for (CountT j = i + 1; j < model.numGeoms; j++) {
+    //             // FIX: Should probably implement == op for Vector4
+    //             const math::Vector4 &rgba_j = model.geomRGBA[j];
+    //             if (model.geomMatIDs[j] == -1 && 
+    //                 rgba_i.x == rgba_j.x &&
+    //                 rgba_i.y == rgba_j.y &&
+    //                 rgba_i.z == rgba_j.z &&
+    //                 rgba_i.w == rgba_j.w
+    //             ) {
+    //                 model.geomMatIDs[j] = materials.size() - 1;
+    //             }
+    //         }
+    //     }
+    // }
 
     HeapArray<SourceObject> objs(model.numGeoms + 1);
 
@@ -605,6 +605,7 @@ static RTAssets loadRenderObjects(
                 .numFaces = source_mesh.numFaces,
                 .materialIDX = static_cast<uint32_t>(model.geomMatIDs[i]),
             };
+            assert(model.geomMatIDs[i] >= 0);
         }
 
         objs[i] = {
diff --git a/src/bridge/mgr.hpp b/src/bridge/mgr.hpp
index a887969d..a70b0062 100644
--- a/src/bridge/mgr.hpp
+++ b/src/bridge/mgr.hpp
@@ -39,7 +39,6 @@ struct GSModel {
     int32_t *geomMatIDs;
     int32_t *enabledGeomGroups;
     madrona::math::Vector3 *geomSizes;
-    madrona::math::Vector4 *geomRGBA;
     madrona::math::Vector4 *matRGBA;
     int32_t *matTexIDs;
     int32_t *matTexOffsets;
diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp
index 21b7d435..ffb0e476 100644
--- a/src/render/render_ctx.cpp
+++ b/src/render/render_ctx.cpp
@@ -55,7 +55,7 @@ using namespace vk;
 using Vertex = render::shader::Vertex;
 using PackedVertex = render::shader::PackedVertex;
 using MeshData = render::shader::MeshData;
-using MaterialData = render::shader::MaterialData;
+using MaterialDataShader = render::shader::MaterialData;
 using ObjectData = render::shader::ObjectData;
 using DrawPushConst = render::shader::DrawPushConst;
 using CullPushConst = render::shader::CullPushConst;
@@ -2030,7 +2030,7 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
     int32_t mat_texture_offset = 0;
     for (const SourceMaterial &mat : src_mats) {
         int32_t num_mat_textures = (int32_t)mat.numTextures;
-        MaterialData *mat_data = materials_ptr + (mat_offset++);
+        MaterialDataShader *mat_data = materials_ptr + (mat_offset++);
         mat_data->color = mat.color;
         mat_data->roughness = mat.roughness;
         mat_data->metalness = mat.metalness;

From 625fbefe8202dcb625a8dd10427e01fad6ea2934 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 01:32:17 -0500
Subject: [PATCH 04/11] fix(3)

---
 src/bridge/bindings.cpp        |  2 +-
 src/render/asset_processor.cpp | 42 ++++++++++++----------------------
 src/render/batch_renderer.cpp  | 37 +++++++++++++-----------------
 3 files changed, 31 insertions(+), 50 deletions(-)

diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp
index 34870d35..acbeeb18 100644
--- a/src/bridge/bindings.cpp
+++ b/src/bridge/bindings.cpp
@@ -180,7 +180,7 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
                 reinterpret_cast<const bool *>(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr,
                 reinterpret_cast<const float *>(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr,
                 reinterpret_cast<const float *>(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr,
-                reinterpret_cast<const float *>(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr,
+                reinterpret_cast<const float *>(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr
             );
         })
         .def("render", [](Manager &mgr,
diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp
index 16536051..a2206474 100644
--- a/src/render/asset_processor.cpp
+++ b/src/render/asset_processor.cpp
@@ -330,14 +330,12 @@ MaterialData initMaterialData(
     }
 
     MaterialData cpu_mat_data = {
-        .textures = (cudaTextureObject_t *)
-            malloc(sizeof(cudaTextureObject_t) * num_textures),
-        .textureBuffers = (cudaArray_t *)
-            malloc(sizeof(cudaArray_t) * num_non_mipmap_textures),
-        .mipmapTextureBuffers = (cudaMipmappedArray_t *)
-            malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures),
-        .materials = (Material *)
-            malloc(sizeof(Material) * num_materials)
+        .textures = (cudaTextureObject_t *) malloc(sizeof(cudaTextureObject_t) * num_textures),
+        .textureBuffers = (cudaArray_t *) malloc(sizeof(cudaArray_t) * num_non_mipmap_textures),
+        .numTextureBuffers = num_non_mipmap_textures,
+        .mipmapTextureBuffers = (cudaMipmappedArray_t *) malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures),
+        .numMipmapTextureBuffers = num_mipmap_textures,
+        .materials = (Material *) malloc(sizeof(Material) * num_materials)
     };
 
     for (uint32_t i = 0; i < num_textures; ++i) {
@@ -349,12 +347,9 @@ MaterialData initMaterialData(
             width = tex.width;
             height = tex.height;
 
-            cudaChannelFormatDesc channel_desc =
-                    cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>();
-
+            cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>();
             cudaArray_t cuda_array;
-            REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc,
-                                     width, height, cudaArrayDefault));
+            REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, width, height, cudaArrayDefault));
 
             REQ_CUDA(cudaMemcpy2DToArray(cuda_array, 0, 0, tex.data,
                                          16 * width / 4,
@@ -375,8 +370,7 @@ MaterialData initMaterialData(
             tex_desc.sRGB = 1;
 
             cudaTextureObject_t tex_obj = 0;
-            REQ_CUDA(cudaCreateTextureObject(&tex_obj,
-                                             &res_desc, &tex_desc, nullptr));
+            REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
 
             cpu_mat_data.textures[i] = tex_obj;
             cpu_mat_data.textureBuffers[i] = cuda_array;
@@ -443,13 +437,10 @@ MaterialData initMaterialData(
         }
     }
 
-    cpu_mat_data.numTextureBuffers = num_non_mipmap_textures;
-    cpu_mat_data.numMipmapTextureBuffers = num_mipmap_textures;
-
     for (uint32_t i = 0; i < num_materials; ++i) {
         Material mat = {
             .color = materials[i].color,
-            .textureIdx = materials[i].textureIdx,
+            .textureIdx = materials[i].numTextures > 0 ? *(materials[i].textureIdx) : -1, //TODO: support batch
             .roughness = materials[i].roughness,
             .metalness = materials[i].metalness,
         };
@@ -458,15 +449,13 @@ MaterialData initMaterialData(
     }
 
     cudaTextureObject_t *gpu_tex_buffer;
-    REQ_CUDA(cudaMalloc(&gpu_tex_buffer, 
-                sizeof(cudaTextureObject_t) * num_textures));
+    REQ_CUDA(cudaMalloc(&gpu_tex_buffer, sizeof(cudaTextureObject_t) * num_textures));
     REQ_CUDA(cudaMemcpy(gpu_tex_buffer, cpu_mat_data.textures, 
                 sizeof(cudaTextureObject_t) * num_textures,
                 cudaMemcpyHostToDevice));
 
     Material *mat_buffer;
-    REQ_CUDA(cudaMalloc(&mat_buffer, 
-                sizeof(Material) * num_materials));
+    REQ_CUDA(cudaMalloc(&mat_buffer, sizeof(Material) * num_materials));
     REQ_CUDA(cudaMemcpy(mat_buffer, cpu_mat_data.materials, 
                 sizeof(Material) * num_materials,
                 cudaMemcpyHostToDevice));
@@ -484,13 +473,10 @@ MaterialData initMaterialData(
 }
 #endif
 
-math::AABB *makeAABBs(
-        Span<const imp::SourceObject> src_objs)
+math::AABB *makeAABBs(Span<const imp::SourceObject> src_objs)
 {
     int num_objects = (int)src_objs.size();
-
-    math::AABB *aabbs = (math::AABB *)malloc(sizeof(math::AABB) *
-            num_objects);
+    math::AABB *aabbs = (math::AABB *)malloc(sizeof(math::AABB) * num_objects);
 
     for (int obj_idx = 0; obj_idx < num_objects; ++obj_idx) {
         auto &obj = src_objs[obj_idx];
diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp
index 23bb9789..c2de67f6 100644
--- a/src/render/batch_renderer.cpp
+++ b/src/render/batch_renderer.cpp
@@ -327,7 +327,7 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev,
     blend_info.pAttachments = blend_attachments.data();
 
     // Dynamic
-    std::array<VkDynamicState, 2> dyn_enable {{
+    std::array<VkDynamicState, 2> dyn_enable {
         VK_DYNAMIC_STATE_VIEWPORT,
         VK_DYNAMIC_STATE_SCISSOR,
     };
@@ -2043,11 +2043,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info,
     }
 
     { // Import the views
-        VkDeviceSize num_views_bytes = info.numViews *
-            sizeof(shader::PackedViewData);
-
+        VkDeviceSize num_views_bytes = info.numViews * sizeof(shader::PackedViewData);
         VkBufferCopy view_data_copy = {
-            .srcOffset = 0, .dstOffset = 0,
+            .srcOffset = 0,
+            .dstOffset = 0,
             .size = num_views_bytes
         };
 
@@ -2057,11 +2056,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info,
     }
 
     { // Import the instances
-        VkDeviceSize num_instances_bytes = info.numInstances *
-            sizeof(shader::PackedInstanceData);
-
+        VkDeviceSize num_instances_bytes = info.numInstances * sizeof(shader::PackedInstanceData);
         VkBufferCopy instance_data_copy = {
-            .srcOffset = 0, .dstOffset = 0,
+            .srcOffset = 0,
+            .dstOffset = 0,
             .size = num_instances_bytes
         };
 
@@ -2071,11 +2069,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info,
     }
 
     { // Import the offsets for instances
-        VkDeviceSize num_offsets_bytes = info.numWorlds *
-            sizeof(int32_t);
-
+        VkDeviceSize num_offsets_bytes = info.numWorlds * sizeof(int32_t);
         VkBufferCopy offsets_data_copy = {
-            .srcOffset = 0, .dstOffset = 0,
+            .srcOffset = 0,
+            .dstOffset = 0,
             .size = num_offsets_bytes
         };
 
@@ -2086,11 +2083,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info,
 
 #if 0
     { // Import the aabbs for instances
-        VkDeviceSize num_aabbs_bytes = info.numInstances *
-            sizeof(shader::AABB);
-
+        VkDeviceSize num_aabbs_bytes = info.numInstances * sizeof(shader::AABB);
         VkBufferCopy aabb_data_copy = {
-            .srcOffset = 0, .dstOffset = 0,
+            .srcOffset = 0,
+            .dstOffset = 0,
             .size = num_aabbs_bytes
         };
 
@@ -2101,11 +2097,10 @@ void BatchRenderer::prepareForRendering(BatchRenderInfo info,
 #endif
 
     { // Import the offsets for views
-        VkDeviceSize num_offsets_bytes = info.numWorlds *
-            sizeof(int32_t);
-
+        VkDeviceSize num_offsets_bytes = info.numWorlds * sizeof(int32_t);
         VkBufferCopy offsets_data_copy = {
-            .srcOffset = 0, .dstOffset = 0,
+            .srcOffset = 0,
+            .dstOffset = 0,
             .size = num_offsets_bytes
         };
 

From 988e566de775758d9387bea855c4e406ad5f6d0b Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 02:42:19 -0500
Subject: [PATCH 05/11] fix(4)

---
 src/bridge/bindings.cpp              | 30 ++++++++++++++--------------
 src/bridge/gs_madrona/renderer_gs.py |  4 +---
 src/render/shaders/viewer_draw.hlsl  | 10 ++++++----
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/bridge/bindings.cpp b/src/bridge/bindings.cpp
index acbeeb18..0c55d79c 100644
--- a/src/bridge/bindings.cpp
+++ b/src/bridge/bindings.cpp
@@ -166,21 +166,21 @@ NB_MODULE(_gs_madrona_batch_renderer, m) {
             nb::ndarray<nb::pytorch, const float, nb::shape<-1, -1>> light_intensity)
         {
             mgr.init(
-                reinterpret_cast<const math::Vector3 *>(geom_pos.data()) if geom_pos.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Quat *>(geom_rot.data()) if geom_rot.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Vector3 *>(cam_pos.data()) if cam_pos.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Quat *>(cam_rot.data()) if cam_rot.shape(0) > 0 else nullptr,
-                reinterpret_cast<const int32_t *>(mat_ids.data()) if mat_ids.shape(0) > 0 else nullptr,
-                reinterpret_cast<const uint32_t *>(geom_rgb.data()) if geom_rgb.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Diag3x3 *>(geom_sizes.data()) if geom_sizes.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Vector3 *>(light_pos.data()) if light_pos.shape(0) > 0 else nullptr,
-                reinterpret_cast<const math::Vector3 *>(light_dir.data()) if light_dir.shape(0) > 0 else nullptr,
-                reinterpret_cast<const uint32_t *>(light_rgb.data()) if light_rgb.shape(0) > 0 else nullptr,
-                reinterpret_cast<const bool *>(light_isdir.data()) if light_isdir.shape(0) > 0 else nullptr,
-                reinterpret_cast<const bool *>(light_castshadow.data()) if light_castshadow.shape(0) > 0 else nullptr,
-                reinterpret_cast<const float *>(light_cutoff.data()) if light_cutoff.shape(0) > 0 else nullptr,
-                reinterpret_cast<const float *>(light_attenuation.data()) if light_attenuation.shape(0) > 0 else nullptr,
-                reinterpret_cast<const float *>(light_intensity.data()) if light_intensity.shape(0) > 0 else nullptr
+                geom_pos.shape(0) > 0 ? reinterpret_cast<const math::Vector3 *>(geom_pos.data()) : nullptr,
+                geom_rot.shape(0) > 0 ? reinterpret_cast<const math::Quat *>(geom_rot.data()) : nullptr,
+                cam_pos.shape(0) > 0 ? reinterpret_cast<const math::Vector3 *>(cam_pos.data()) : nullptr,
+                cam_rot.shape(0) > 0 ? reinterpret_cast<const math::Quat *>(cam_rot.data()) : nullptr,
+                mat_ids.shape(0) > 0 ? reinterpret_cast<const int32_t *>(mat_ids.data()) : nullptr,
+                geom_rgb.shape(0) > 0 ? reinterpret_cast<const uint32_t *>(geom_rgb.data()) : nullptr,
+                geom_sizes.shape(0) > 0 ? reinterpret_cast<const math::Diag3x3 *>(geom_sizes.data()) : nullptr,
+                light_pos.shape(0) > 0 ? reinterpret_cast<const math::Vector3 *>(light_pos.data()) : nullptr,
+                light_dir.shape(0) > 0 ? reinterpret_cast<const math::Vector3 *>(light_dir.data()) : nullptr,
+                light_rgb.shape(0) > 0 ? reinterpret_cast<const uint32_t *>(light_rgb.data()) : nullptr,
+                light_isdir.shape(0) > 0 ? reinterpret_cast<const bool *>(light_isdir.data()) : nullptr,
+                light_castshadow.shape(0) > 0 ? reinterpret_cast<const bool *>(light_castshadow.data()) : nullptr,
+                light_cutoff.shape(0) > 0 ? reinterpret_cast<const float *>(light_cutoff.data()) : nullptr,
+                light_attenuation.shape(0) > 0 ? reinterpret_cast<const float *>(light_attenuation.data()) : nullptr,
+                light_intensity.shape(0) > 0 ? reinterpret_cast<const float *>(light_intensity.data()) : nullptr
             );
         })
         .def("render", [](Manager &mgr,
diff --git a/src/bridge/gs_madrona/renderer_gs.py b/src/bridge/gs_madrona/renderer_gs.py
index b2a9f7fb..377463ea 100644
--- a/src/bridge/gs_madrona/renderer_gs.py
+++ b/src/bridge/gs_madrona/renderer_gs.py
@@ -31,7 +31,6 @@ def __init__(
         geom_retriever: GeomRetriever,
         gpu_id: int,
         num_worlds: int,
-        num_cameras: int,
         num_lights: int,
         cam_fovs_tensor: torch.Tensor,
         cam_znears_tensor: torch.Tensor,
@@ -44,7 +43,7 @@ def __init__(
         assert geom_retriever is not None, "GeomRetriever is required for MadronaBatchRendererAdapter"
         assert gpu_id >= 0, "GPU ID must be greater than or equal to 0"
         assert num_worlds > 0, "Number of worlds must be greater than 0"
-        assert num_cameras > 0, "Must have at least one camera for Madrona to work!"
+        assert cam_fovs_tensor.shape[0] > 0, "Must have at least one camera for Madrona to work!"
         assert batch_render_view_width > 0, "Batch render view width must be greater than 0"
         assert batch_render_view_height > 0, "Batch render view height must be greater than 0"
 
@@ -72,7 +71,6 @@ def __init__(
             gpu_id=gpu_id,
             **geom_args_static,
             num_lights=num_lights,
-            num_cams=num_cameras,
             num_worlds=num_worlds,
             batch_render_view_width=batch_render_view_width,
             batch_render_view_height=batch_render_view_height,
diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl
index 9e250a1a..ebe1761f 100644
--- a/src/render/shaders/viewer_draw.hlsl
+++ b/src/render/shaders/viewer_draw.hlsl
@@ -41,9 +41,10 @@ StructuredBuffer<int> materialTexturesIndices;
 struct V2F {
     [[vk::location(0)]] float3 normal : TEXCOORD0;
     [[vk::location(1)]] float3 position : TEXCOORD1;
-    [[vk::location(3)]] float dummy : TEXCOORD3;
-    [[vk::location(4)]] float2 uv : TEXCOORD4;
-    [[vk::location(3)]] int materialIdx : TEXCOORD2;
+    [[vk::location(2)]] float dummy : TEXCOORD3;
+    [[vk::location(3)]] float2 uv : TEXCOORD4;
+    [[vk::location(4)]] int materialIdx : TEXCOORD5;
+    [[vk::location(5)]] float worldIdx : TEXCOORD6;
 };
 
 PerspectiveCameraData getCameraData()
@@ -137,6 +138,7 @@ float4 vert(in uint vid : SV_VertexID,
     v2f.position = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position;
     v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0];
     v2f.materialIdx = draw_data.materialID;
+    v2f.worldIdx = instance_data.worldID;
 
     return clip_pos;
 }
@@ -161,7 +163,7 @@ PixelOutput frag(in V2F v2f)
     uint texture_count = mat_data.textureCount;
     if (texture_count > 0) {
         uint texture_start = mat_data.textureOffset;
-        texture_idx = materialTexturesIndices[texture_start + v2f.worldID % texture_count];
+        texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count];
     }
     if (texture_idx != -1) {
         color *= materialTexturesArray[texture_idx].SampleLevel(linearSampler, v2f.uv, 0);

From e729ea6254d0556a4cc6db0227029219f755fe6a Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 03:15:18 -0500
Subject: [PATCH 06/11] Fix(5) remove materialIdx in Vertex

---
 src/render/shaders/batch_draw_rgb.hlsl |   2 +-
 src/render/shaders/draw_gbuffer.hlsl   | 149 -------------------------
 src/render/shaders/shader_common.h     |   1 -
 src/render/shaders/shader_utils.hlsl   |   1 -
 src/render/shaders/viewer_draw.hlsl    |   4 +-
 5 files changed, 3 insertions(+), 154 deletions(-)
 delete mode 100644 src/render/shaders/draw_gbuffer.hlsl

diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl
index 377d60f5..554bd40f 100644
--- a/src/render/shaders/batch_draw_rgb.hlsl
+++ b/src/render/shaders/batch_draw_rgb.hlsl
@@ -278,7 +278,7 @@ PixelOutput frag(in V2F v2f, in uint prim_id : SV_PrimitiveID)
         float4 color = mat_data.color;
 
         int texture_idx = -1;
-        uint texture_count = mat_data.textureCount;
+        uint texture_count = mat_data.numTextures;
         if (texture_count > 0) {
             uint texture_start = mat_data.textureOffset;
             texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count];
diff --git a/src/render/shaders/draw_gbuffer.hlsl b/src/render/shaders/draw_gbuffer.hlsl
deleted file mode 100644
index 22224574..00000000
--- a/src/render/shaders/draw_gbuffer.hlsl
+++ /dev/null
@@ -1,149 +0,0 @@
-#include "shader_utils.hlsl"
-
-[[vk::push_constant]]
-DrawPushConst push_const;
-
-[[vk::binding(0, 0)]]
-StructuredBuffer<PackedViewData> viewDataBuffer;
-
-[[vk::binding(1, 0)]]
-StructuredBuffer<PackedInstanceData> engineInstanceBuffer;
-
-[[vk::binding(2, 0)]]
-StructuredBuffer<DrawData> drawDataBuffer;
-
-[[vk::binding(3, 0)]]
-StructuredBuffer<ShadowViewData> shadowViewDataBuffer;
-
-// Asset descriptor bindings
-[[vk::binding(0, 1)]]
-StructuredBuffer<PackedVertex> vertexDataBuffer;
-
-[[vk::binding(1, 1)]]
-StructuredBuffer<MaterialData> materialBuffer;
-
-// Texture descriptor bindings
-[[vk::binding(0, 2)]]
-Texture2D<float4> materialTexturesArray[];
-
-[[vk::binding(1, 2)]]
-SamplerState linearSampler;
-
-[[vk::binding(2, 2)]]
-StructuredBuffer<int> materialTexturesIndices;
-
-struct V2F {
-    [[vk::location(0)]] float3 normal : TEXCOORD0;
-    [[vk::location(1)]] float3 position : TEXCOORD1;
-    [[vk::location(2)]] float4 color : TEXCOORD2;
-    [[vk::location(3)]] float dummy : TEXCOORD3;
-    [[vk::location(4)]] float2 uv : TEXCOORD4;
-    [[vk::location(5)]] int texIdx : TEXCOORD5;
-    [[vk::location(6)]] float roughness : TEXCOORD6;
-    [[vk::location(7)]] float metalness : TEXCOORD7;
-};
-
-[shader("vertex")]
-float4 vert(in uint vid : SV_VertexID,
-            in uint draw_id : SV_InstanceID,
-            out V2F v2f) : SV_Position
-{
-    DrawData draw_data = drawDataBuffer[draw_id];
-
-    Vertex vert = unpackVertex(vertexDataBuffer[vid]);
-    float4 color = materialBuffer[vert.materialIdx].color;
-    uint instance_id = draw_data.instanceID;
-
-    PerspectiveCameraData view_data =unpackViewData(viewDataBuffer[push_const.viewIdx]);
-    EngineInstanceData instance_data = unpackEngineInstanceData(engineInstanceBuffer[instance_id]);
-
-    float3 to_view_translation;
-    float4 to_view_rotation;
-    computeCompositeTransform(instance_data.position, instance_data.rotation,
-        view_data.pos, view_data.rot,
-        to_view_translation, to_view_rotation);
-
-    float3 view_pos =
-        rotateVec(to_view_rotation, instance_data.scale * vert.position) +
-            to_view_translation;
-
-    float4 clip_pos = float4(
-        view_data.xScale * view_pos.x,
-        view_data.yScale * view_pos.z,
-        view_data.zNear,
-        view_pos.y);
-
-    // v2f.viewPos = view_pos;
-#if 0
-    v2f.normal = normalize(
-        rotateVec(to_view_rotation, (vert.normal / instance_data.scale)));
-#endif
-    v2f.normal = normalize(
-        rotateVec(instance_data.rotation, (vert.normal / instance_data.scale)));
-    v2f.uv = vert.uv;
-    v2f.color = color;
-    v2f.position = rotateVec(instance_data.rotation,
-                             instance_data.scale * vert.position) + instance_data.position;
-    v2f.dummy = shadowViewDataBuffer[0].viewProjectionMatrix[0][0];
-    v2f.texIdx = materialBuffer[vert.materialIdx].textureIdx;
-    v2f.roughness = materialBuffer[vert.materialIdx].roughness;
-    v2f.metalness = materialBuffer[vert.materialIdx].metalness;
-
-    return clip_pos;
-}
-
-struct PixelOutput {
-    float4 color : SV_Target0;
-    float4 normal : SV_Target1;
-    float4 position : SV_Target2;
-};
-
-[shader("pixel")]
-PixelOutput frag(in V2F v2f)
-{
-    PixelOutput output;
-    output.color = v2f.color;
-    output.color.a = v2f.roughness;
-    output.normal = float4(normalize(v2f.normal), 1.f);
-    output.position = float4(v2f.position, v2f.dummy * 0.0000001f);
-    output.position.a += v2f.metalness;
-
-    // output.color.rgb = v2f.normal.xyz;
-
-    if (v2f.texIdx != -1) {
-        output.color *= materialTexturesArray[v2f.texIdx].SampleLevel(
-            linearSampler, float2(v2f.uv.x, 1.f - v2f.uv.y), 0);
-    }
-
-    return output;
-}
-
-#if 0
-DrawInstanceData unpackDrawInstanceData(PackedDrawInstanceData data)
-{
-    const float4 d0 = data.packed[0];
-    const float4 d1 = data.packed[1];
-    const float4 d2 = data.packed[2];
-    const float4 d3 = data.packed[3];
-    const float4 d4 = data.packed[4];
-
-    DrawInstanceData out;
-
-    float3 rot_col0 = d0.xyz;
-    float3 rot_col1 = float3(d0.w, d1.xy);
-    float3 rot_col2 = float3(d1.zw, d2.x);
-
-    out.toViewRot = float3x3(
-        float3(rot_col0.x, rot_col1.x, rot_col2.x),
-        float3(rot_col0.y, rot_col1.y, rot_col2.y),
-        float3(rot_col0.z, rot_col1.z, rot_col2.z),
-    );
-    out.toViewTranslation = d2.yzw;
-    out.objScale = d3.xyz;
-    out.viewIdx = asint(d3.w);
-    out.projScale = d4.xy;
-    out.projZNear = d4.z;
-
-    return out;
-}
-#endif
diff --git a/src/render/shaders/shader_common.h b/src/render/shaders/shader_common.h
index 17136283..6885d881 100644
--- a/src/render/shaders/shader_common.h
+++ b/src/render/shaders/shader_common.h
@@ -144,7 +144,6 @@ struct Vertex {
     float3 normal;
     float4 tangentAndSign;
     float2 uv;
-    uint32_t materialIdx;
 };
 
 struct PackedVertex {
diff --git a/src/render/shaders/shader_utils.hlsl b/src/render/shaders/shader_utils.hlsl
index 5d8b3715..f1427b19 100644
--- a/src/render/shaders/shader_utils.hlsl
+++ b/src/render/shaders/shader_utils.hlsl
@@ -56,7 +56,6 @@ Vertex unpackVertex(PackedVertex packed)
     vert.normal = normal;
     vert.tangentAndSign = tangent_and_sign;
     vert.uv = unpackHalf2x16(asuint(d1.z));
-    vert.materialIdx = asuint(d1.w);
 
     return vert;
 }
diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl
index ebe1761f..d4fcd1b2 100644
--- a/src/render/shaders/viewer_draw.hlsl
+++ b/src/render/shaders/viewer_draw.hlsl
@@ -154,15 +154,15 @@ PixelOutput frag(in V2F v2f)
 {
     PixelOutput output;
 
+    MaterialData mat_data = materialBuffer[v2f.materialIdx];
     float metalness = mat_data.metalness;
     float roughness = mat_data.roughness;
-    MaterialData mat_data = materialBuffer[v2f.materialIdx];
     float4 color = mat_data.color;
 
     int texture_idx = -1;
     uint texture_count = mat_data.textureCount;
     if (texture_count > 0) {
-        uint texture_start = mat_data.textureOffset;
+        uint texture_start = mat_data.numTextures;
         texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count];
     }
     if (texture_idx != -1) {

From 316723b2e8ef16359fbdf998b87b9ad14ae428fc Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 06:05:16 -0500
Subject: [PATCH 07/11] fix(6) attachment format

---
 src/mw/device/sort_archetype.cpp     |  2 +-
 src/render/batch_renderer.cpp        | 11 ++--
 src/render/render_common.hpp         |  6 --
 src/render/render_ctx.cpp            | 85 +++++++++++-----------------
 src/render/shaders/post_process.hlsl |  2 +-
 src/render/shaders/viewer_draw.hlsl  |  4 +-
 6 files changed, 41 insertions(+), 69 deletions(-)

diff --git a/src/mw/device/sort_archetype.cpp b/src/mw/device/sort_archetype.cpp
index 8a56e6c1..1be54fd8 100644
--- a/src/mw/device/sort_archetype.cpp
+++ b/src/mw/device/sort_archetype.cpp
@@ -168,7 +168,7 @@ struct BlockRadixRankMatchEarlyCountsCustom
                 for (int u = 0; u < WARP_BINS_PER_THREAD; ++u)
                 {
                     int bin = lane + u * WARP_THREADS;
-                    bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum());
+                    bins[u] = internal::ThreadReduce<NUM_PARTS>(warp_histograms[bin], Sum());
                 }
                 CTA_SYNC();
 
diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp
index c2de67f6..28aa7724 100644
--- a/src/render/batch_renderer.cpp
+++ b/src/render/batch_renderer.cpp
@@ -378,15 +378,15 @@ static PipelineMP<1> makeDrawPipeline(const vk::Device &dev,
         },
     }};
     std::array<VkFormat, 3> colorFormats = {
-        InternalConfig::componentFormats[0],
-        InternalConfig::componentFormats[2],
-        InternalConfig::componentFormats[3]
+        InternalConfig::componentAttachFormats[0],
+        InternalConfig::componentAttachFormats[2],
+        InternalConfig::componentAttachFormats[3]
     };
     VkPipelineRenderingCreateInfo rendering_info = {};
     rendering_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO;
     rendering_info.colorAttachmentCount = 3;
     rendering_info.pColorAttachmentFormats = colorFormats.data();
-    rendering_info.depthAttachmentFormat = InternalConfig::componentFormats[1];
+    rendering_info.depthAttachmentFormat = InternalConfig::componentAttachFormats[1];
 
     VkGraphicsPipelineCreateInfo gfx_infos =
     {
@@ -471,8 +471,7 @@ static PipelineMP<1> makeShadowDrawPipeline(const vk::Device &dev,
 
     // Depth/Stencil
     VkPipelineDepthStencilStateCreateInfo depth_info {};
-    depth_info.sType =
-        VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
+    depth_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
     depth_info.depthTestEnable = VK_TRUE;
     depth_info.depthWriteEnable = VK_TRUE;
     depth_info.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL;
diff --git a/src/render/render_common.hpp b/src/render/render_common.hpp
index 1f2b5a9f..334d4530 100644
--- a/src/render/render_common.hpp
+++ b/src/render/render_common.hpp
@@ -40,12 +40,6 @@ inline constexpr uint32_t componentBytes[maxComponents] = {
     4,  // VK_FORMAT_R8G8B8A8_UNORM (normals)
     4   // VK_FORMAT_R32_SINT (segmentation: 4 bytes per int32)
 };
-inline constexpr VkFormat componentFormats[maxComponents] = {
-    VK_FORMAT_R8G8B8A8_UNORM,   // RGB
-    VK_FORMAT_D32_SFLOAT,       // depth
-    VK_FORMAT_R8G8B8A8_UNORM,   // normals
-    VK_FORMAT_R32_SINT          // segmentation
-};
 inline constexpr VkFormat componentAttachFormats[maxComponents] = {
     VK_FORMAT_R32G32B32A32_SFLOAT,  // RGB (float4)
     VK_FORMAT_D32_SFLOAT,           // depth (float)
diff --git a/src/render/render_ctx.cpp b/src/render/render_ctx.cpp
index ffb0e476..423d3b6c 100644
--- a/src/render/render_ctx.cpp
+++ b/src/render/render_ctx.cpp
@@ -1420,12 +1420,12 @@ RenderContext::RenderContext(
     }
 
     {
-        VkDescriptorBindingFlags flags[] = { VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT, 0 };
+        VkDescriptorBindingFlags flags[] = { VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT, 0, 0 };
 
         VkDescriptorSetLayoutBindingFlagsCreateInfo flag_info = {
             .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO,
             .pNext = nullptr,
-            .bindingCount = 2,
+            .bindingCount = 3,
             .pBindingFlags = flags,
         };
 
@@ -2098,102 +2098,81 @@ CountT RenderContext::loadObjects(Span<const imp::SourceObject> src_objs,
     }
 
     // DynArray<VkWriteDescriptorSet> desc_updates(9 + (material_textures_.size() > 0 ? 2 : 0));
-    DynArray<VkWriteDescriptorSet> desc_updates(100);
+    DynArray<VkWriteDescriptorSet> desc_updates(12);
 
     VkDescriptorBufferInfo obj_info;
     obj_info.buffer = asset_buffer.buffer;
     obj_info.offset = 0;
     obj_info.range = buffer_sizes[0];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[0], asset_set_cull_, &obj_info, 0);
+    DescHelper::storage(desc_updates.emplace_back(), asset_set_cull_, &obj_info, 0);
 
     VkDescriptorBufferInfo mesh_info;
     mesh_info.buffer = asset_buffer.buffer;
     mesh_info.offset = buffer_offsets[0];
     mesh_info.range = buffer_sizes[1];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[1], asset_set_cull_, &mesh_info, 1);
+    DescHelper::storage(desc_updates.emplace_back(), asset_set_cull_, &mesh_info, 1);
+    DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &mesh_info, 1);
 
     VkDescriptorBufferInfo vert_info;
     vert_info.buffer = asset_buffer.buffer;
     vert_info.offset = buffer_offsets[1];
     vert_info.range = buffer_sizes[2];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[2], asset_set_draw_, &vert_info, 0);
+    DescHelper::storage(desc_updates.emplace_back(), asset_set_draw_, &vert_info, 0);
+    DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &vert_info, 0);
 
     VkDescriptorBufferInfo mat_info;
     mat_info.buffer = asset_buffer.buffer;
     mat_info.offset = buffer_offsets[3];
     mat_info.range = buffer_sizes[4];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[3], asset_set_draw_, &mat_info, 1);
+    DescHelper::storage(desc_updates.emplace_back(), asset_set_draw_, &mat_info, 1);
+    DescHelper::storage(desc_updates.emplace_back(), asset_batch_lighting_set_, &mat_info, 2);
 
     VkDescriptorBufferInfo index_set_info;
     index_set_info.buffer = asset_buffer.buffer;
     index_set_info.offset = buffer_offsets[2];
     index_set_info.range = buffer_sizes[3];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[4], index_buffer_set, &index_set_info, 0);
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[5], asset_batch_lighting_set_, &vert_info, 0);
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[6], asset_batch_lighting_set_, &mesh_info, 1);
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[7], asset_batch_lighting_set_, &mat_info, 2);
+    DescHelper::storage(desc_updates.emplace_back(), index_buffer_set, &index_set_info, 0);
 
     VkDescriptorBufferInfo aabb_set_info;
     aabb_set_info.buffer = asset_buffer.buffer;
     aabb_set_info.offset = buffer_offsets[4];
     aabb_set_info.range = buffer_sizes[5];
-
-    desc_updates.push_back({});
-    DescHelper::storage(desc_updates[8], aabb_buffer_set, &aabb_set_info, 0);
+    DescHelper::storage(desc_updates.emplace_back(), aabb_buffer_set, &aabb_set_info, 0);
 
     if (textures.size()) {
         material_textures_ = loadTextures(dev, alloc, renderQueue, textures);
     }
 
-    if (material_textures_.size()) {
-        DynArray<VkDescriptorImageInfo> tx_infos(material_textures_.size());
-        for (auto &tx : material_textures_) {
-            tx_infos.push_back({
-                .sampler = VK_NULL_HANDLE,
-                .imageView = tx.view,
-                .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
-            });
-        }
-        desc_updates.push_back({});
-        DescHelper::textures(desc_updates[9], asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0);
-        
-        VkDescriptorBufferInfo mat_tx_info;
-        mat_tx_info.buffer = asset_buffer.buffer;
-        mat_tx_info.offset = buffer_offsets[5];
-        mat_tx_info.range = buffer_sizes[6];
-
-        desc_updates.push_back({});
-        DescHelper::storage(desc_updates[10], asset_set_mat_tex_, &mat_tx_info, 2);
+    DynArray<VkDescriptorImageInfo> tx_infos(material_textures_.size());
+    for (const auto &tx : material_textures_) {
+        tx_infos.push_back({
+            .sampler = VK_NULL_HANDLE,
+            .imageView = tx.view,
+            .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
+        });
+    }
+    if (tx_infos.size()) {
+        DescHelper::textures(desc_updates.emplace_back(), asset_set_mat_tex_, tx_infos.data(), tx_infos.size(), 0);
+    }
+    
+    VkDescriptorBufferInfo mat_tx_info;
+    mat_tx_info.buffer = asset_buffer.buffer;
+    mat_tx_info.offset = buffer_offsets[5];
+    mat_tx_info.range = buffer_sizes[6];
+    if (tx_infos.size()) {
+        DescHelper::storage(desc_updates.emplace_back(), asset_set_mat_tex_, &mat_tx_info, 2);
     }
 
     DescHelper::update(dev, desc_updates.data(), desc_updates.size());
 
-    AssetData asset_data {
+    loaded_assets_.emplace_back(AssetData {
         std::move(asset_buffer),
         (uint32_t)buffer_offsets[2],
         index_buffer_set,
         buffer_offsets[4],
         buffer_sizes[5],
         aabb_buffer_set
-    };
-
-    loaded_assets_.emplace_back(std::move(asset_data));
+    });
 
     return 0;
 }
diff --git a/src/render/shaders/post_process.hlsl b/src/render/shaders/post_process.hlsl
index 9969df1f..65b4c46c 100644
--- a/src/render/shaders/post_process.hlsl
+++ b/src/render/shaders/post_process.hlsl
@@ -372,7 +372,7 @@ float3 applyFXAA(uint32_t view_idx, int2 coord) {
 }
 
 // ------------------------------------------------------------------------------------------------
-[numThreads(16, 16, 1)]
+[numThreads(32, 32, 1)]
 [shader("compute")]
 void main(uint3 idx : SV_DispatchThreadID)
 {
diff --git a/src/render/shaders/viewer_draw.hlsl b/src/render/shaders/viewer_draw.hlsl
index d4fcd1b2..77129600 100644
--- a/src/render/shaders/viewer_draw.hlsl
+++ b/src/render/shaders/viewer_draw.hlsl
@@ -160,9 +160,9 @@ PixelOutput frag(in V2F v2f)
     float4 color = mat_data.color;
 
     int texture_idx = -1;
-    uint texture_count = mat_data.textureCount;
+    uint texture_count = mat_data.numTextures;
     if (texture_count > 0) {
-        uint texture_start = mat_data.numTextures;
+        uint texture_start = mat_data.textureOffset;
         texture_idx = materialTexturesIndices[texture_start + v2f.worldIdx % texture_count];
     }
     if (texture_idx != -1) {

From 89fd6681df13fb5cbe68b0436362d1f6b68328d2 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Thu, 4 Dec 2025 07:03:43 -0500
Subject: [PATCH 08/11] fix(7) other validation

---
 src/render/batch_renderer.cpp          | 11 ++++++++---
 src/render/shaders/batch_draw_rgb.hlsl |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/render/batch_renderer.cpp b/src/render/batch_renderer.cpp
index 28aa7724..ecbf6bd3 100644
--- a/src/render/batch_renderer.cpp
+++ b/src/render/batch_renderer.cpp
@@ -1054,8 +1054,8 @@ static void issueComputeLayoutTransitions(
         return VkImageMemoryBarrier{
             .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
             .pNext = nullptr,
-            .srcAccessMask = isDepth ? VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
-                                     : VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
+            .srcAccessMask = isDepth ? VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT
+                                     : VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
             .oldLayout = isDepth ? VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL
                                  : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
@@ -1704,6 +1704,11 @@ BatchRenderer::~BatchRenderer()
     impl->dev.dt.destroyPipeline(impl->dev.hdl, impl->shadowDraw.hdls[0], nullptr);
     impl->dev.dt.destroyPipelineLayout(impl->dev.hdl, impl->shadowDraw.layout, nullptr);
 
+    if (impl->postProcess.has_value()) {
+        impl->dev.dt.destroyPipeline(impl->dev.hdl, impl->postProcess->hdls[0], nullptr);
+        impl->dev.dt.destroyPipelineLayout(impl->dev.hdl, impl->postProcess->layout, nullptr);
+    }
+
     for (CountT i = 0; i < impl->batchFrames.size(); i++) {
         impl->dev.dt.destroyCommandPool(impl->dev.hdl, impl->batchFrames[i].prepareCmdPool, nullptr);
         impl->dev.dt.destroyCommandPool(impl->dev.hdl, impl->batchFrames[i].renderCmdPool, nullptr);
@@ -1721,7 +1726,7 @@ BatchRenderer::~BatchRenderer()
             impl->dev.dt.destroyImageView(impl->dev.hdl, impl->batchFrames[i].targets[j].shadowMapView, nullptr);
             impl->dev.dt.destroyImageView(impl->dev.hdl, impl->batchFrames[i].targets[j].shadowDepthView, nullptr);
         }
-    }        
+    }
 
     impl->dev.dt.destroyQueryPool(impl->dev.hdl, impl->timeQueryPool, nullptr);
 }
diff --git a/src/render/shaders/batch_draw_rgb.hlsl b/src/render/shaders/batch_draw_rgb.hlsl
index 554bd40f..43929f47 100644
--- a/src/render/shaders/batch_draw_rgb.hlsl
+++ b/src/render/shaders/batch_draw_rgb.hlsl
@@ -113,7 +113,7 @@ void vert(in uint vid : SV_VertexID,
 
     v2f.worldPos = rotateVec(instance_data.rotation, instance_data.scale * vert.position) + instance_data.position;
     v2f.position = clip_pos;
-    v2f.uv = float2(vert.uv.x, 1.0f - vert.uv.y);
+    v2f.uv = vert.uv;
     v2f.worldNormal = rotateVec(instance_data.rotation, vert.normal);
     v2f.worldIdx = instance_data.worldID;
     v2f.viewIdx = draw_data.viewID;

From a2bf345c107fe21b5fa953add31a054f7f252903 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Fri, 5 Dec 2025 02:16:00 -0500
Subject: [PATCH 09/11] update batch raytrace

---
 include/madrona/mesh_bvh.hpp                  |   3 +-
 .../render/cuda_batch_render_assets.hpp       |   5 +-
 src/bridge/mgr.cpp                            |  29 ++--
 src/mw/device/bvh_raycast.cpp                 |  95 ++++--------
 src/mw/device/ecs_render_bridge.cpp           |  58 ++-----
 src/mw/device/include/madrona/bvh.hpp         |   1 +
 src/render/asset_processor.cpp                | 142 ++++++++++--------
 7 files changed, 147 insertions(+), 186 deletions(-)

diff --git a/include/madrona/mesh_bvh.hpp b/include/madrona/mesh_bvh.hpp
index 43c51ff4..288faed7 100644
--- a/include/madrona/mesh_bvh.hpp
+++ b/include/madrona/mesh_bvh.hpp
@@ -149,7 +149,8 @@ struct Material {
     // For now, just a color
     math::Vector4 color;
 
-    int32_t textureIdx;
+    int32_t textureOffset;
+    int32_t numTextures;
 
     float roughness;
     float metalness;
diff --git a/include/madrona/render/cuda_batch_render_assets.hpp b/include/madrona/render/cuda_batch_render_assets.hpp
index 42750006..8fd7a514 100644
--- a/include/madrona/render/cuda_batch_render_assets.hpp
+++ b/include/madrona/render/cuda_batch_render_assets.hpp
@@ -22,10 +22,11 @@ struct MeshBVHData {
 struct MaterialData {
     // GPU buffer containing array of texture objects
     cudaTextureObject_t *textures;
-    uint32_t numTextureBuffers;
     cudaArray_t *textureBuffers;
-    uint32_t numMipmapTextureBuffers;
+    uint32_t numTextureBuffers;
     cudaMipmappedArray_t *mipmapTextureBuffers;
+    uint32_t numMipmapTextureBuffers;
+    int32_t *materialTextures;
     Material *materials;
 };
 
diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp
index 7cfcca0a..58718ca0 100644
--- a/src/bridge/mgr.cpp
+++ b/src/bridge/mgr.cpp
@@ -489,24 +489,25 @@ static RTAssets loadRenderObjects(
         };
     }
 
-    SourceTexture *out_textures = tmp_alloc.allocN<SourceTexture>(model.numTextures);
+    // Use HeapArray with StackAlloc for consistency and better performance
+    // StackAlloc provides efficient temporary allocations that are automatically cleaned up
+    HeapArray<SourceTexture> textures(model.numTextures, tmp_alloc);
 
     for (CountT i = 0; i < model.numTextures; i++) {
         // TODO: NChans is not used.
         uint64_t tex_offset = model.texOffsets[i];
-        Optional<SourceTexture> tex = SourceTexture {
+        textures[i] = SourceTexture {
             .data = model.texData + tex_offset,
             .format = SourceTextureFormat::R8G8B8A8,
             .width = (uint32_t)model.texWidths[i],
             .height = (uint32_t)model.texHeights[i],
             .numBytes = (size_t)(model.texWidths[i] * model.texHeights[i] * 4),
         };
-        out_textures[i] = *tex;
     }
 
-    Span<imp::SourceTexture> imported_textures = Span(out_textures, model.numTextures);
-
-    std::vector<imp::SourceMaterial> materials;
+    // Use HeapArray instead of std::vector since size is known upfront
+    // This avoids dynamic reallocation and provides better performance
+    HeapArray<imp::SourceMaterial> materials(model.numMats, tmp_alloc);
     for (CountT i = 0; i < model.numMats; i++) {
         const math::Vector4 &rgba = model.matRGBA[i];
         uint32_t mat_tex_offset = model.matTexOffsets[i];
@@ -514,14 +515,13 @@ static RTAssets loadRenderObjects(
             model.matTexOffsets[i + 1] : model.numMatTextures;
         uint32_t mat_tex_num = next_tex_offset - mat_tex_offset;
 
-        SourceMaterial mat = {
+        materials[i] = SourceMaterial {
             .color = math::Vector4{rgba.x, rgba.y, rgba.z, rgba.w},
             .textureIdx = model.matTexIDs + mat_tex_offset,
             .numTextures = mat_tex_num,
             .roughness = 0.0f,
             .metalness = 0.0f
         };
-        materials.push_back(mat);
     }
 
     // Create materials for geoms that do not have one assigned
@@ -629,16 +629,19 @@ static RTAssets loadRenderObjects(
     }
 
     if (render_mgr.has_value()) {
-        render_mgr->loadObjects(objs, materials, imported_textures);
+        // HeapArray automatically converts to Span via template constructor
+        render_mgr->loadObjects(objs, materials, textures);
     }
 
     if (use_rt) {
         auto ret = RTAssets {
             render::AssetProcessor::makeBVHData(objs),
-            render::AssetProcessor::initMaterialData(materials.data(),
-                                     materials.size(),
-                                     imported_textures.data(),
-                                     imported_textures.size())
+            render::AssetProcessor::initMaterialData(
+                materials.data(),
+                materials.size(),
+                textures.data(),
+                textures.size()
+            )
         };
 
         return ret;
diff --git a/src/mw/device/bvh_raycast.cpp b/src/mw/device/bvh_raycast.cpp
index 64e61df8..d09da6c9 100644
--- a/src/mw/device/bvh_raycast.cpp
+++ b/src/mw/device/bvh_raycast.cpp
@@ -602,22 +602,17 @@ static __device__ TraceResult traceRay(
             // Intersect with the children of the child to get a new node group
             // and calculate the present bits according to which were
             // intersected
-            uint32_t child_node_idx =
-                node_buffer[current_grp & 0xFFFF'FFFF].childrenIdx[child_idx];
+            uint32_t child_node_idx = node_buffer[current_grp & 0xFFFF'FFFF].childrenIdx[child_idx];
             bool child_is_leaf = (child_node_idx & 0x8000'0000);
             child_node_idx = child_node_idx & (~0x8000'0000);
 
             GroupType new_grp_type = GroupType::TopLevel;
 
-            if (parent_grp_type == GroupType::TopLevel &&
-                child_is_leaf) {
+            if (parent_grp_type == GroupType::TopLevel && child_is_leaf) {
                 // Need to compute new ray o/d/etc...
                 instance_idx = (int32_t)(child_node_idx & ~0x8000'0000);
-                InstanceData *instance_data = world_info.instances + 
-                                              instance_idx;
-
-                current_bvh = bvhParams.bvhs + 
-                              world_info.instances[instance_idx].objectID;
+                InstanceData *instance_data = world_info.instances + instance_idx;
+                current_bvh = bvhParams.bvhs + world_info.instances[instance_idx].objectID;
 
                 // Should be able to just do a continue in this case - we'll
                 // just resume processing the parent node
@@ -625,11 +620,9 @@ static __device__ TraceResult traceRay(
                     continue;
 
                 ray_o = instance_data->scale.inv() *
-                    instance_data->rotation.inv().rotateVec(
-                            (ray_o - instance_data->position));
+                    instance_data->rotation.inv().rotateVec(ray_o - instance_data->position);
                 ray_d = instance_data->scale.inv() *
-                    instance_data->rotation.inv().rotateVec(
-                            ray_d);
+                    instance_data->rotation.inv().rotateVec(ray_d);
                 t_scale = ray_d.length();
                 t_max *= t_scale;
 
@@ -682,12 +675,10 @@ static __device__ TraceResult traceRay(
                 }
             }
 
-            current_grp = encodeNodeGroup(
-                    child_node_idx, grp_present_bits, new_grp_type);
+            current_grp = encodeNodeGroup(child_node_idx, grp_present_bits, new_grp_type);
 
             if (tri_present_bits) {
-                triangle_grp = encodeNodeGroup(
-                        child_node_idx, tri_present_bits, GroupType::Triangles);
+                triangle_grp = encodeNodeGroup(child_node_idx, tri_present_bits, GroupType::Triangles);
             } else {
                 triangle_grp = invalidNodeGroup();
             }
@@ -701,28 +692,16 @@ static __device__ TraceResult traceRay(
             while (getTrianglePresentBits(triangle_grp) != 0) {
                 // TODO: check active mask against heuristic to exit if not enough
                 // threads are working on this
-                uint32_t local_node_tri_idx = 
-                    __ffs(getTrianglePresentBits(triangle_grp)) - 1;
-
-                uint32_t local_leaf_idx = 
-                    local_node_tri_idx / MeshBVH::numTrisPerLeaf;
-                uint32_t tri_idx = 
-                    local_node_tri_idx % MeshBVH::numTrisPerLeaf;
-
-                uint32_t glob_leaf_idx =
-                    parent.childrenIdx[local_leaf_idx] & (~0x8000'0000);
+                uint32_t local_node_tri_idx = __ffs(getTrianglePresentBits(triangle_grp)) - 1;
+                uint32_t local_leaf_idx = local_node_tri_idx / MeshBVH::numTrisPerLeaf;
+                uint32_t tri_idx = local_node_tri_idx % MeshBVH::numTrisPerLeaf;
+                uint32_t glob_leaf_idx = parent.childrenIdx[local_leaf_idx] & (~0x8000'0000);
 
                 TriHitInfo hit_info = triangleIntersect(
-                        glob_leaf_idx,
-                        tri_idx,
-                        isect_info,
-                        ray_o,
-                        t_max,
-                        current_bvh);
+                    glob_leaf_idx, tri_idx, isect_info, ray_o, t_max, current_bvh);
 
                 if (hit_info.hit) {
                     t_max = hit_info.tHit;
-
                     tri_hit = hit_info;
                     tri_hit.instanceIdx = instance_idx;
                 }
@@ -768,8 +747,7 @@ static __device__ TraceResult traceRay(
             int32_t material_idx = override_mat_id;
 
             if (override_mat_id == MaterialOverride::UseDefaultMaterial) {
-                material_idx = tri_hit.bvh->getMaterialIDX(
-                        tri_hit.leafMaterialIndex);
+                material_idx = tri_hit.bvh->getMaterialIDX(tri_hit.leafMaterialIndex);
             }
 
             Vector3 color = { 1.f, 1.f, 1.f };
@@ -778,9 +756,9 @@ static __device__ TraceResult traceRay(
                 color = hexToRgb(instance->color);
             } else if (material_idx != -1) {
                 Material *mat = &bvhParams.materials[material_idx];
-
-                if (mat->textureIdx != -1) {
-                    cudaTextureObject_t *tex = &bvhParams.textures[mat->textureIdx];
+                if (mat->numTextures > 0) {
+                    int32_t texture_idx = bvhParams.materialTextures[mat->textureOffset + instance->worldIDX % mat->numTextures];
+                    cudaTextureObject_t *tex = bvhParams.textures + texture_idx;
 
                     // --- Mipmap LOD selection ---
                     // Estimate LOD based on distance from camera and UV footprint
@@ -799,12 +777,13 @@ static __device__ TraceResult traceRay(
                     // Clamp LOD to [0, 8]
                     lod = fminf(fmaxf(lod, 0.0f), 8.0f);
 
-                    float4 sampled_color = tex2DLod<float4>(*tex,
-                            tri_hit.uv.x, tri_hit.uv.y, lod);
+                    float4 sampled_color = tex2DLod<float4>(*tex, tri_hit.uv.x, tri_hit.uv.y, lod);
 
-                    Vector3 tex_color = { sampled_color.x,
+                    Vector3 tex_color = {
+                        sampled_color.x,
                         sampled_color.y,
-                        sampled_color.z };
+                        sampled_color.z
+                    };
 
                     color.x = tex_color.x * mat->color.x;
                     color.y = tex_color.y * mat->color.y;
@@ -942,13 +921,13 @@ static __device__ FragmentResult computeFragment(
                     // TODO: Definitely do some sort of ray fetching because there will
                     // be threads doing nothing potentially.
                     TraceResult shadow_hit = traceRay(
-                            TraceInfo {
+                        TraceInfo {
                             .rayOrigin = hit_pos,
                             .rayDirection = -ray_dir,
                             .tMin = 0.000001f,
                             .tMax = 10000.f,
                             .dOnly = true
-                            }, world_info);
+                        }, world_info);
                     if(shadow_hit.hit) {
                         continue;
                     }
@@ -1004,29 +983,20 @@ extern "C" __global__ void bvhRaycastEntry()
     const uint32_t resident_view_offset = blockIdx.x;
 
     uint32_t current_view_offset = resident_view_offset;
-
-    uint32_t bytes_per_view =
-        bvhParams.renderOutputWidth * bvhParams.renderOutputHeight * 4;
-
+    uint32_t bytes_per_view = bvhParams.renderOutputWidth * bvhParams.renderOutputHeight * 4;
     uint32_t num_processed_pixels = 0;
-
     uint32_t pixel_x = blockIdx.y * pixels_per_block + threadIdx.x;
     uint32_t pixel_y = blockIdx.z * pixels_per_block + threadIdx.y;
 
     while (current_view_offset < total_num_views) {
         // While we still have views to generate, trace.
-        PerspectiveCameraData *view_data = 
-            &bvhParams.views[current_view_offset];
-
+        PerspectiveCameraData *view_data = &bvhParams.views[current_view_offset];
         uint32_t world_idx = (uint32_t)view_data->worldIDX;
-
         Vector3 ray_start = view_data->position;
         Vector3 ray_dir = calculateOutRay(view_data, pixel_x, pixel_y);
 
         uint32_t internal_nodes_offset = bvhParams.instanceOffsets[world_idx];
 
-
-
         // This does both the tracing / lighting, etc... just like a fragment
         // shader does in GLSL.
         FragmentResult result = computeFragment(
@@ -1038,22 +1008,15 @@ extern "C" __global__ void bvhRaycastEntry()
                 .dOnly = false
             },
             TraceWorldInfo {
-                .nodes = bvhParams.internalData->traversalNodes + 
-                         internal_nodes_offset,
+                .nodes = bvhParams.internalData->traversalNodes + internal_nodes_offset,
                 .instances = bvhParams.instances + internal_nodes_offset,
                 .lights = &bvhParams.lights[bvhParams.lightOffsets[world_idx]],
                 .numLights = (uint32_t)bvhParams.lightCounts[world_idx]
             }
         );
 
-
-
-
-        uint32_t linear_pixel_idx = 4 * 
-            (pixel_x + pixel_y * bvhParams.renderOutputWidth);
-
-        uint32_t global_pixel_byte_off = current_view_offset * bytes_per_view +
-            linear_pixel_idx;
+        uint32_t linear_pixel_idx = 4 * (pixel_x + pixel_y * bvhParams.renderOutputWidth);
+        uint32_t global_pixel_byte_off = current_view_offset * bytes_per_view + linear_pixel_idx;
 
         if (bvhParams.raycastRGBD) {
             // Write both depth and color information
diff --git a/src/mw/device/ecs_render_bridge.cpp b/src/mw/device/ecs_render_bridge.cpp
index e41de39b..dba21fd4 100644
--- a/src/mw/device/ecs_render_bridge.cpp
+++ b/src/mw/device/ecs_render_bridge.cpp
@@ -10,6 +10,7 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params,
                                          void *timings,
                                          void *materials,
                                          void *textures,
+                                         void *material_textures,
                                          uint32_t num_sms,
                                          uint32_t sm_shared_memory)
 {
@@ -20,44 +21,22 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params,
     StateManager *mgr = mwGPU::getStateManager();
     mwGPU::HostAllocator *host_alloc = mwGPU::getHostAllocator();
     mwGPU::TmpAllocator *tmp_alloc = &mwGPU::TmpAllocator::get();
-    mwGPU::HostPrint *host_print = 
-        (mwGPU::HostPrint *)mwGPU::GPUImplConsts::get().hostPrintAddr;
+    mwGPU::HostPrint *host_print = (mwGPU::HostPrint *)mwGPU::GPUImplConsts::get().hostPrintAddr;
     uint32_t raycast_rgbd = mwGPU::GPUImplConsts::get().raycastRGBD;
 
     params->numWorlds = num_worlds;
 
-    params->instances = mgr->getArchetypeComponent<
-        RenderableArchetype, InstanceData>();
-
-    params->views = mgr->getArchetypeComponent<
-        RenderCameraArchetype, PerspectiveCameraData>();
-
-    params->lights = mgr->getArchetypeComponent<
-        LightArchetype, LightDesc>();
-
-    params->instanceOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<
-        RenderableArchetype>();
-
-    params->instanceCounts = (int32_t *)mgr->getArchetypeWorldCounts<
-        RenderableArchetype>();
-
-    params->aabbs = (TLBVHNode *)mgr->getArchetypeComponent<
-        RenderableArchetype, TLBVHNode>();
-
-    params->viewOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<
-        RenderCameraArchetype>();
-
-    params->viewCounts = (int32_t *)mgr->getArchetypeWorldCounts<
-        RenderCameraArchetype>();
-
-    params->lightOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<
-        LightArchetype>();
-
-    params->lightCounts = (int32_t *)mgr->getArchetypeWorldCounts<
-        LightArchetype>();
-
-    params->mortonCodes = (uint32_t *)mgr->getArchetypeComponent<
-        RenderableArchetype, MortonCode>();
+    params->instances = mgr->getArchetypeComponent<RenderableArchetype, InstanceData>();
+    params->views = mgr->getArchetypeComponent<RenderCameraArchetype, PerspectiveCameraData>();
+    params->lights = mgr->getArchetypeComponent<LightArchetype, LightDesc>();
+    params->instanceOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<RenderableArchetype>();
+    params->instanceCounts = (int32_t *)mgr->getArchetypeWorldCounts<RenderableArchetype>();
+    params->aabbs = (TLBVHNode *)mgr->getArchetypeComponent<RenderableArchetype, TLBVHNode>();
+    params->viewOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<RenderCameraArchetype>();
+    params->viewCounts = (int32_t *)mgr->getArchetypeWorldCounts<RenderCameraArchetype>();
+    params->lightOffsets = (int32_t *)mgr->getArchetypeWorldOffsets<LightArchetype>();
+    params->lightCounts = (int32_t *)mgr->getArchetypeWorldCounts<LightArchetype>();
+    params->mortonCodes = (uint32_t *)mgr->getArchetypeComponent<RenderableArchetype, MortonCode>();
 
     params->bvhs = (MeshBVH *)bvhs;
 
@@ -65,20 +44,15 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params,
 
     params->rgbOutput = (void *)mgr->getArchetypeComponent<
         RaycastOutputArchetype, render::RGBOutputBuffer>();
-
     params->depthOutput = (void *)mgr->getArchetypeComponent<
         RaycastOutputArchetype, render::DepthOutputBuffer>();
-
     params->normalOutput = (void *)mgr->getArchetypeComponent<
         RaycastOutputArchetype, render::NormalOutputBuffer>();
-
     params->segmentationOutput = (void *)mgr->getArchetypeComponent<
         RaycastOutputArchetype, render::SegmentationOutputBuffer>();
 
-    params->renderOutputWidth = 
-        mwGPU::GPUImplConsts::get().raycastOutputWidth;
-    params->renderOutputHeight = 
-        mwGPU::GPUImplConsts::get().raycastOutputHeight;
+    params->renderOutputWidth = mwGPU::GPUImplConsts::get().raycastOutputWidth;
+    params->renderOutputHeight = mwGPU::GPUImplConsts::get().raycastOutputHeight;
 
     params->internalData = (BVHInternalData *)internal_data;
 
@@ -87,8 +61,8 @@ extern "C" __global__ void initBVHParams(madrona::BVHParams *params,
     params->hostPrintAddr = (void *)host_print;
 
     params->materials = (Material *)materials;
-
     params->textures = (cudaTextureObject_t *)textures;
+    params->materialTextures = (int32_t *)material_textures;
 
     params->raycastRGBD = raycast_rgbd;
 
diff --git a/src/mw/device/include/madrona/bvh.hpp b/src/mw/device/include/madrona/bvh.hpp
index 19a36b71..6890e7b4 100644
--- a/src/mw/device/include/madrona/bvh.hpp
+++ b/src/mw/device/include/madrona/bvh.hpp
@@ -153,6 +153,7 @@ struct BVHParams {
 
     Material *materials;
     cudaTextureObject_t *textures;
+    int32_t *materialTextures;
 
     // Used to determine how many thread blocks per SM.
     uint32_t numSMs;
diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp
index a2206474..d1f62ee3 100644
--- a/src/render/asset_processor.cpp
+++ b/src/render/asset_processor.cpp
@@ -318,16 +318,23 @@ MaterialData initMaterialData(
     uint32_t num_textures)
 {
     // TODO: Only generate mipmaps for RGBA textures
-    // Count number of BC7 textures and number of non-BC7 textures
+    // Count textures by type:
+    // - BC7 (compressed): uses non-mipmapped cudaArray_t → textureBuffers
+    // - RGBA (uncompressed): uses mipmapped cudaMipmappedArray_t → mipmapTextureBuffers
     uint32_t num_non_mipmap_textures = 0;
     uint32_t num_mipmap_textures = 0;
     for (uint32_t i = 0; i < num_textures; ++i) {
         if (textures[i].format == imp::SourceTextureFormat::BC7) {
-            num_mipmap_textures++;
-        } else {
             num_non_mipmap_textures++;
+        } else {
+            num_mipmap_textures++;
         }
     }
+    
+    uint32_t num_material_textures = 0;
+    for (uint32_t i = 0; i < num_materials; ++i) {
+        num_material_textures += materials[i].numTextures;
+    }
 
     MaterialData cpu_mat_data = {
         .textures = (cudaTextureObject_t *) malloc(sizeof(cudaTextureObject_t) * num_textures),
@@ -335,65 +342,73 @@ MaterialData initMaterialData(
         .numTextureBuffers = num_non_mipmap_textures,
         .mipmapTextureBuffers = (cudaMipmappedArray_t *) malloc(sizeof(cudaMipmappedArray_t) * num_mipmap_textures),
         .numMipmapTextureBuffers = num_mipmap_textures,
+        .materialTextures = (int32_t *) malloc(sizeof(int32_t) * num_material_textures),
         .materials = (Material *) malloc(sizeof(Material) * num_materials)
     };
 
+    // Separate indices for texture buffer arrays (they have different sizes)
+    uint32_t non_mipmap_idx = 0;
+    uint32_t mipmap_idx = 0;
+
+    // Helper function to create texture descriptor (reduces duplication)
+    auto createTextureDesc = [](bool use_mipmap, uint32_t max_mip_level = 0) {
+        cudaTextureDesc tex_desc = {};
+        tex_desc.addressMode[0] = cudaAddressModeWrap;
+        tex_desc.addressMode[1] = cudaAddressModeWrap;
+        tex_desc.filterMode = cudaFilterModeLinear;
+        tex_desc.readMode = cudaReadModeNormalizedFloat;
+        tex_desc.normalizedCoords = 1;
+        tex_desc.sRGB = 1;
+        if (use_mipmap) {
+            tex_desc.mipmapFilterMode = cudaFilterModeLinear;
+            tex_desc.maxMipmapLevelClamp = max_mip_level;
+        }
+        return tex_desc;
+    };
+
     for (uint32_t i = 0; i < num_textures; ++i) {
         const auto &tex = textures[i];
-        int width, height;
-        void *pixels = nullptr;
 
         if (tex.format == imp::SourceTextureFormat::BC7) {
-            width = tex.width;
-            height = tex.height;
-
-            cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>();
+            // BC7 compressed textures: use non-mipmapped array
+            cudaChannelFormatDesc channel_desc = 
+                cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>();
             cudaArray_t cuda_array;
-            REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, width, height, cudaArrayDefault));
+            REQ_CUDA(cudaMallocArray(&cuda_array, &channel_desc, 
+                                     tex.width, tex.height, cudaArrayDefault));
 
             REQ_CUDA(cudaMemcpy2DToArray(cuda_array, 0, 0, tex.data,
-                                         16 * width / 4,
-                                         16 * width / 4,
-                                         height / 4,
+                                         16 * tex.width / 4,
+                                         16 * tex.width / 4,
+                                         tex.height / 4,
                                          cudaMemcpyHostToDevice));
 
             cudaResourceDesc res_desc = {};
             res_desc.resType = cudaResourceTypeArray;
             res_desc.res.array.array = cuda_array;
 
-            cudaTextureDesc tex_desc = {};
-            tex_desc.addressMode[0] = cudaAddressModeWrap;
-            tex_desc.addressMode[1] = cudaAddressModeWrap;
-            tex_desc.filterMode = cudaFilterModeLinear;
-            tex_desc.readMode = cudaReadModeNormalizedFloat;
-            tex_desc.normalizedCoords = 1;
-            tex_desc.sRGB = 1;
-
+            cudaTextureDesc tex_desc = createTextureDesc(false);
             cudaTextureObject_t tex_obj = 0;
             REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
 
             cpu_mat_data.textures[i] = tex_obj;
-            cpu_mat_data.textureBuffers[i] = cuda_array;
+            cpu_mat_data.textureBuffers[non_mipmap_idx++] = cuda_array;
         } else {
-            pixels = tex.data;
-            width = tex.width;
-            height = tex.height; 
-
             // TODO: Only generate mipmaps for RGBA textures
-            // Generate mipmaps
-            const uint32_t MAX_MIPS = 16; // Should be enough for any reasonable texture size
-            void* mip_data[MAX_MIPS];
-            memset(mip_data, 0, sizeof(mip_data));
+            // RGBA uncompressed textures: generate mipmaps
+            const uint32_t MAX_MIPS = 16;
+            void* mip_data[MAX_MIPS] = {};
             uint32_t mip_widths[MAX_MIPS];
             uint32_t mip_heights[MAX_MIPS];
             uint32_t num_mips;
             
-            generateMipmaps(pixels, width, height, mip_data, mip_widths, mip_heights, num_mips);
+            generateMipmaps(tex.data, tex.width, tex.height, 
+                            mip_data, mip_widths, mip_heights, num_mips);
 
-            // Create CUDA array with mipmaps
+            // Create CUDA mipmapped array
             cudaMipmappedArray_t mipArray;
             cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar4>();
-            cudaExtent extent = make_cudaExtent(width, height, 0);
+            cudaExtent extent = make_cudaExtent(tex.width, tex.height, 0);
             REQ_CUDA(cudaMallocMipmappedArray(&mipArray, &channelDesc, extent, num_mips));
 
             // Copy each mip level
@@ -418,56 +433,59 @@ MaterialData initMaterialData(
             res_desc.resType = cudaResourceTypeMipmappedArray;
             res_desc.res.mipmap.mipmap = mipArray;
 
-            cudaTextureDesc tex_desc = {};
-            tex_desc.addressMode[0] = cudaAddressModeWrap;
-            tex_desc.addressMode[1] = cudaAddressModeWrap;
-            tex_desc.filterMode = cudaFilterModeLinear;
-            tex_desc.readMode = cudaReadModeNormalizedFloat;
-            tex_desc.normalizedCoords = 1;
-            tex_desc.sRGB = 1;
-            tex_desc.mipmapFilterMode = cudaFilterModeLinear;
-            tex_desc.maxMipmapLevelClamp = num_mips - 1;
-
+            cudaTextureDesc tex_desc = createTextureDesc(true, num_mips - 1);
             cudaTextureObject_t tex_obj = 0;
-            REQ_CUDA(cudaCreateTextureObject(&tex_obj,
-                        &res_desc, &tex_desc, nullptr));
-
+            REQ_CUDA(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
             cpu_mat_data.textures[i] = tex_obj;
-            cpu_mat_data.mipmapTextureBuffers[i] = mipArray;
+            cpu_mat_data.mipmapTextureBuffers[mipmap_idx++] = mipArray;
         }
     }
 
+    // Populate materials array
+    int32_t material_texture_offset = 0;
     for (uint32_t i = 0; i < num_materials; ++i) {
-        Material mat = {
-            .color = materials[i].color,
-            .textureIdx = materials[i].numTextures > 0 ? *(materials[i].textureIdx) : -1, //TODO: support batch
-            .roughness = materials[i].roughness,
-            .metalness = materials[i].metalness,
+        const SourceMaterial &src_mat = materials[i];
+        cpu_mat_data.materials[i] = Material {
+            .color = src_mat.color,
+            .textureOffset = material_texture_offset,
+            .numTextures = src_mat.numTextures,
+            .roughness = src_mat.roughness,
+            .metalness = src_mat.metalness,
         };
-
-        cpu_mat_data.materials[i] = mat;
+        material_texture_offset += src_mat.numTextures;
     }
 
+    // Copy texture objects and materials to GPU
     cudaTextureObject_t *gpu_tex_buffer;
     REQ_CUDA(cudaMalloc(&gpu_tex_buffer, sizeof(cudaTextureObject_t) * num_textures));
     REQ_CUDA(cudaMemcpy(gpu_tex_buffer, cpu_mat_data.textures, 
                 sizeof(cudaTextureObject_t) * num_textures,
                 cudaMemcpyHostToDevice));
 
-    Material *mat_buffer;
-    REQ_CUDA(cudaMalloc(&mat_buffer, sizeof(Material) * num_materials));
-    REQ_CUDA(cudaMemcpy(mat_buffer, cpu_mat_data.materials, 
+    Material *gpu_mat_buffer;
+    REQ_CUDA(cudaMalloc(&gpu_mat_buffer, sizeof(Material) * num_materials));
+    REQ_CUDA(cudaMemcpy(gpu_mat_buffer, cpu_mat_data.materials, 
                 sizeof(Material) * num_materials,
                 cudaMemcpyHostToDevice));
 
+    int32_t *gpu_mat_texs;
+    REQ_CUDA(cudaMalloc(&gpu_mat_texs, sizeof(int32_t) * num_material_textures));
+    REQ_CUDA(cudaMemcpy(gpu_mat_texs, cpu_mat_data.materialTextures, 
+                sizeof(int32_t) * num_material_textures,
+                cudaMemcpyHostToDevice));
+
+    // Free CPU-side temporary buffers
     free(cpu_mat_data.textures);
     free(cpu_mat_data.materials);
-    free(cpu_mat_data.textureBuffers);
-    free(cpu_mat_data.mipmapTextureBuffers);
-
-    auto gpu_mat_data = cpu_mat_data;
+    free(cpu_mat_data.materialTextures);
+    
+    // Note: textureBuffers and mipmapTextureBuffers are CUDA arrays that remain on GPU
+    // They are not freed here as they're part of the returned MaterialData
+    // Return GPU-side MaterialData
+    MaterialData gpu_mat_data = cpu_mat_data;
     gpu_mat_data.textures = gpu_tex_buffer;
-    gpu_mat_data.materials = mat_buffer;
+    gpu_mat_data.materials = gpu_mat_buffer;
+    gpu_mat_data.materialTextures = gpu_mat_texs;
 
     return gpu_mat_data;
 }

From c6f4a007b106710e2260902301906dbb61c2e37a Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Fri, 5 Dec 2025 03:37:07 -0500
Subject: [PATCH 10/11] fix(1)

---
 include/madrona/mesh_bvh.hpp |  2 +-
 src/bridge/mgr.cpp           | 11 ++++-------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/include/madrona/mesh_bvh.hpp b/include/madrona/mesh_bvh.hpp
index 288faed7..981ccee8 100644
--- a/include/madrona/mesh_bvh.hpp
+++ b/include/madrona/mesh_bvh.hpp
@@ -150,7 +150,7 @@ struct Material {
     math::Vector4 color;
 
     int32_t textureOffset;
-    int32_t numTextures;
+    uint32_t numTextures;
 
     float roughness;
     float metalness;
diff --git a/src/bridge/mgr.cpp b/src/bridge/mgr.cpp
index 58718ca0..d2a40be8 100644
--- a/src/bridge/mgr.cpp
+++ b/src/bridge/mgr.cpp
@@ -401,8 +401,6 @@ static RTAssets loadRenderObjects(
     Optional<render::RenderManager> &render_mgr,
     bool use_rt)
 {
-    StackAlloc tmp_alloc;
-
     std::array<std::string, 1> render_asset_paths;
     const char *py_root_env = getenv("MADRONA_ROOT_PATH");
     std::filesystem::path data_dir = py_root_env ? (std::string(py_root_env) + "/data") : DATA_DIR;
@@ -489,9 +487,8 @@ static RTAssets loadRenderObjects(
         };
     }
 
-    // Use HeapArray with StackAlloc for consistency and better performance
-    // StackAlloc provides efficient temporary allocations that are automatically cleaned up
-    HeapArray<SourceTexture> textures(model.numTextures, tmp_alloc);
+    // Use HeapArray with default allocator - automatically cleaned up when it goes out of scope
+    HeapArray<SourceTexture> textures(model.numTextures);
 
     for (CountT i = 0; i < model.numTextures; i++) {
         // TODO: NChans is not used.
@@ -505,9 +502,9 @@ static RTAssets loadRenderObjects(
         };
     }
 
-    // Use HeapArray instead of std::vector since size is known upfront
+    // Use HeapArray with default allocator - automatically cleaned up when it goes out of scope
     // This avoids dynamic reallocation and provides better performance
-    HeapArray<imp::SourceMaterial> materials(model.numMats, tmp_alloc);
+    HeapArray<imp::SourceMaterial> materials(model.numMats);
     for (CountT i = 0; i < model.numMats; i++) {
         const math::Vector4 &rgba = model.matRGBA[i];
         uint32_t mat_tex_offset = model.matTexOffsets[i];

From 0e553183c9013bd1fdca16dc29257f00caeb3f01 Mon Sep 17 00:00:00 2001
From: Zhehuan Chen <50882714+ACMLCZH@users.noreply.github.com>
Date: Fri, 5 Dec 2025 05:34:13 -0500
Subject: [PATCH 11/11] fix(2)

---
 src/mw/cuda_exec.cpp           | 91 +++++++++++-----------------------
 src/render/asset_processor.cpp |  9 +++-
 2 files changed, 37 insertions(+), 63 deletions(-)

diff --git a/src/mw/cuda_exec.cpp b/src/mw/cuda_exec.cpp
index 193babea..0401db5e 100644
--- a/src/mw/cuda_exec.cpp
+++ b/src/mw/cuda_exec.cpp
@@ -1295,40 +1295,31 @@ static BVHKernels buildBVHKernels(const CompileConfig &cfg,
     }
 
     CUfunction bvh_build_fast;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_fast, mod,
-                               "bvhBuildFast"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_fast, mod, "bvhBuildFast"));
 
     CUfunction bvh_build_slow;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_slow, mod,
-                               "bvhBuildSlow"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_build_slow, mod, "bvhBuildSlow"));
 
     CUfunction bvh_init;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_init, mod,
-                               "bvhInit"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_init, mod, "bvhInit"));
 
     CUfunction bvh_alloc;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_alloc, mod,
-                               "bvhAllocInternalNodes"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_alloc, mod, "bvhAllocInternalNodes"));
 
     CUfunction bvh_aabbs;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_aabbs, mod,
-                               "bvhConstructAABBs"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_aabbs, mod, "bvhConstructAABBs"));
 
     CUfunction widen_tree;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&widen_tree, mod,
-                               "bvhWidenTree"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&widen_tree, mod, "bvhWidenTree"));
 
     CUfunction bvh_opt;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_opt, mod,
-                               "bvhOptimizeLBVH"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_opt, mod, "bvhOptimizeLBVH"));
 
     CUfunction bvh_debug;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_debug, mod,
-                               "bvhDebug"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_debug, mod, "bvhDebug"));
 
     CUfunction bvh_raycast_entry;
-    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_raycast_entry, mod,
-                               "bvhRaycastEntry"));
+    REQ_CU(CudaDynamicLoader::cuModuleGetFunction(&bvh_raycast_entry, mod, "bvhRaycastEntry"));
 
     CUevent alloc_event;
     CUevent build_event;
@@ -1364,8 +1355,7 @@ static BVHKernels buildBVHKernels(const CompileConfig &cfg,
         .stopEvent = stop_event,
         .recordedTimings = {},
         .timingInfo = timing_info,
-        .raycastRGBD = (uint32_t)(render_mode == 
-                CudaBatchRenderConfig::RenderMode::RGBD),
+        .raycastRGBD = (uint32_t)(render_mode == CudaBatchRenderConfig::RenderMode::RGBD),
         .meshBVHData = {},
         .materialData = {},
     };
@@ -1417,13 +1407,9 @@ static GPUKernels buildKernels(const CompileConfig &cfg,
            sizeof(const char *) * cfg.userSources.size());
 
     // Build architecture string for this GPU
-    string gpu_arch_str = "sm_" + to_string(cuda_arch.first) +
-        to_string(cuda_arch.second);
-
+    string gpu_arch_str = "sm_" + to_string(cuda_arch.first) + to_string(cuda_arch.second);
     string gpu_arch_flag = std::string("-arch=") + gpu_arch_str;
-
-    string num_sms_str =
-        "-DMADRONA_MWGPU_NUM_SMS=(" + to_string(num_sms) + "_i32)";
+    string num_sms_str = "-DMADRONA_MWGPU_NUM_SMS=(" + to_string(num_sms) + "_i32)";
 
     CountT max_megakernel_blocks_per_sm = 1;
     for (const MegakernelConfig &megakernel_cfg : megakernel_cfgs) {
@@ -1821,8 +1807,7 @@ static GPUEngineState initEngineAndUserState(
                               0, strm, nullptr, args.data()));
     };
 
-    uint64_t num_init_bytes =
-        (uint64_t)num_world_init_bytes * (uint64_t)num_worlds;
+    uint64_t num_init_bytes = (uint64_t)num_world_init_bytes * (uint64_t)num_worlds;
     auto init_tmp_buffer = cu::allocGPU(num_init_bytes);
     REQ_CUDA(cudaMemcpyAsync(init_tmp_buffer, world_init_ptr,
         num_init_bytes, cudaMemcpyHostToDevice, strm));
@@ -1831,14 +1816,9 @@ static GPUEngineState initEngineAndUserState(
     REQ_CUDA(cudaMemcpyAsync(user_cfg_gpu_buffer, user_cfg_host_ptr,
         num_user_cfg_bytes, cudaMemcpyHostToDevice, strm));
 
-    auto gpu_consts_readback = (GPUImplConsts *)cu::allocReadback(
-        sizeof(GPUImplConsts));
-
-    auto gpu_state_size_readback = (size_t *)cu::allocReadback(
-        sizeof(size_t));
-
-    auto exported_readback = (void **)cu::allocReadback(
-        sizeof(void *) * num_exported);
+    auto gpu_consts_readback = (GPUImplConsts *)cu::allocReadback(sizeof(GPUImplConsts));
+    auto gpu_state_size_readback = (size_t *)cu::allocReadback(sizeof(size_t));
+    auto exported_readback = (void **)cu::allocReadback(sizeof(void *) * num_exported);
 
     CUdeviceptr allocator_channel_devptr;
     REQ_CU(CudaDynamicLoader::cuMemAllocManaged(&allocator_channel_devptr,
@@ -1864,16 +1844,14 @@ static GPUEngineState initEngineAndUserState(
     }
 
     HostAllocInit alloc_init {
-        std::max((uint64_t)sysconf(_SC_PAGESIZE),
-                 (uint64_t)cu_va_alloc_granularity),
+        std::max((uint64_t)sysconf(_SC_PAGESIZE), (uint64_t)cu_va_alloc_granularity),
         (uint64_t)cu_va_alloc_granularity,
         allocator_channel,
     };
 
     FreeQueue *fq = new FreeQueue {};
 
-    std::thread allocator_thread(
-        gpuVMAllocatorThread, allocator_channel, cu_ctx, fq);
+    std::thread allocator_thread(gpuVMAllocatorThread, allocator_channel, cu_ctx, fq);
 
     auto host_print = std::make_unique<HostPrintCPU>(cu_gpu);
 
@@ -1889,8 +1867,7 @@ static GPUEngineState initEngineAndUserState(
         raycast_output_width = render_cfg->renderWidth;
         raycast_output_height = render_cfg->renderHeight;
 
-        bvh_internals = cu::allocGPU(
-            sizeof(mwGPU::madrona::BVHInternalData));
+        bvh_internals = cu::allocGPU(sizeof(mwGPU::madrona::BVHInternalData));
     } else {
         bvh_ptrs = nullptr;
         num_bvhs = 0;
@@ -1919,8 +1896,7 @@ static GPUEngineState initEngineAndUserState(
                                              exported_readback,
                                              user_cfg_gpu_buffer);
 
-    auto init_tasks_args = makeKernelArgBuffer(
-        num_taskgraphs, user_cfg_gpu_buffer);
+    auto init_tasks_args = makeKernelArgBuffer(num_taskgraphs, user_cfg_gpu_buffer);
 
     auto init_worlds_args = makeKernelArgBuffer(num_worlds,
         user_cfg_gpu_buffer,
@@ -1928,8 +1904,7 @@ static GPUEngineState initEngineAndUserState(
 
     auto no_args = makeKernelArgBuffer();
 
-    launchKernel(gpu_kernels.computeGPUImplConsts, 1, 1,
-                 compute_consts_args);
+    launchKernel(gpu_kernels.computeGPUImplConsts, 1, 1, compute_consts_args);
 
     REQ_CUDA(cudaStreamSynchronize(strm));
 
@@ -1984,11 +1959,8 @@ static GPUEngineState initEngineAndUserState(
                         job_sys_consts_size));
 
     if (exec_mode == ExecutorMode::JobSystem) {
-        launchKernel(gpu_kernels.initWorlds, 1, consts::numMegakernelThreads,
-                     no_args);
-    
-        uint32_t num_queue_blocks =
-            utils::divideRoundUp(num_worlds, consts::numEntryQueueThreads);
+        launchKernel(gpu_kernels.initWorlds, 1, consts::numMegakernelThreads, no_args);
+        uint32_t num_queue_blocks = utils::divideRoundUp(num_worlds, consts::numEntryQueueThreads);
 
         launchKernel(gpu_kernels.queueUserInit, num_queue_blocks,
                      consts::numEntryQueueThreads, init_worlds_args); 
@@ -1997,9 +1969,7 @@ static GPUEngineState initEngineAndUserState(
                      consts::numMegakernelThreads, no_args);
     } else if (exec_mode == ExecutorMode::TaskGraph) {
         launchKernel(gpu_kernels.initECS, 1, 1, init_ecs_args);
-
-        uint32_t num_init_blocks =
-            utils::divideRoundUp(num_worlds, consts::numMegakernelThreads);
+        uint32_t num_init_blocks = utils::divideRoundUp(num_worlds, consts::numMegakernelThreads);
 
         launchKernel(gpu_kernels.initWorlds, num_init_blocks,
                      consts::numMegakernelThreads, init_worlds_args);
@@ -2012,8 +1982,7 @@ static GPUEngineState initEngineAndUserState(
     cu::deallocGPU(init_tmp_buffer);
 
     HeapArray<void *> exported_cols(num_exported);
-    memcpy(exported_cols.data(), exported_readback,
-           sizeof(void *) * (uint64_t)num_exported);
+    memcpy(exported_cols.data(), exported_readback, sizeof(void *) * (uint64_t)num_exported);
 
     cu::deallocCPU(exported_readback);
 
@@ -2037,20 +2006,19 @@ static GPUEngineState initEngineAndUserState(
             bvh_kernels.timingInfo,
             render_cfg->materialData.materials,
             render_cfg->materialData.textures,
+            render_cfg->materialData.materialTextures,
             (uint32_t)num_sms,
             (uint32_t)shared_mem_per_sm);
 
         // Launch the kernel in the megakernel module to initialize the BVH 
         // params
-        launchKernel(gpu_kernels.initBVHParams, 1, 1,
-                     init_bvh_args);
+        launchKernel(gpu_kernels.initBVHParams, 1, 1, init_bvh_args);
 
         REQ_CUDA(cudaStreamSynchronize(strm));
 
         // Call the bvh init function from bvh module
         auto bvh_init_internal_args = makeKernelArgBuffer(alloc_init);
-        launchKernel(bvh_kernels.init, 1, 1,
-                     no_args);
+        launchKernel(bvh_kernels.init, 1, 1, no_args);
 
         REQ_CUDA(cudaStreamSynchronize(strm));
 
@@ -2210,8 +2178,7 @@ static DynArray<int32_t> processExecConfigFile(
         REQ_JSON(kv.value().get(num_blocks));
 
         uint64_t node_idx;
-        auto res = std::from_chars(key.data(), key.data() + key.size(),
-                                   node_idx);
+        auto res = std::from_chars(key.data(), key.data() + key.size(), node_idx);
         
         if (res.ec != std::errc {}) {
             FATAL("MADRONA_MWGPU_EXEC_CONFIG_FILE points to invalid file");
diff --git a/src/render/asset_processor.cpp b/src/render/asset_processor.cpp
index d1f62ee3..0a4b4c7b 100644
--- a/src/render/asset_processor.cpp
+++ b/src/render/asset_processor.cpp
@@ -14,6 +14,7 @@
 #include <span>
 #include <array>
 #include <algorithm>
+#include <cstring>
 
 using bytes = std::span<const std::byte>;
 
@@ -441,7 +442,7 @@ MaterialData initMaterialData(
         }
     }
 
-    // Populate materials array
+    // Populate materials and material_textures array
     int32_t material_texture_offset = 0;
     for (uint32_t i = 0; i < num_materials; ++i) {
         const SourceMaterial &src_mat = materials[i];
@@ -452,6 +453,12 @@ MaterialData initMaterialData(
             .roughness = src_mat.roughness,
             .metalness = src_mat.metalness,
         };
+        
+        if (src_mat.textureIdx != nullptr && src_mat.numTextures > 0) {
+            memcpy(&cpu_mat_data.materialTextures[material_texture_offset],
+                   src_mat.textureIdx,
+                   sizeof(int32_t) * src_mat.numTextures);
+        }
         material_texture_offset += src_mat.numTextures;
     }