Skip to content

Commit 2c0d016

Browse files
author
CubeCoders
committed
Performance improvements: Gate unneeded fields in RenderVertex behind feature flags to reduce memory bandwidth pressure, add ability to bake directional lighting into static objects, RenderOrder re-orders a list of indices rather than the actual triangles, faster AABB culling, manually unrolled loop on ESP32 for SIMD copying. Roughly 30% perf improvement in showcase.
1 parent c380978 commit 2c0d016

8 files changed

Lines changed: 434 additions & 112 deletions

File tree

src/Material.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class Material {
3838
///< biasing the reflection axis upward to align with the true
3939
///< waterline. Stored in device-native pixels (no resolution scaling).
4040
///< Set via WaterSurface::setReflectionMode(alpha, rippleAmp, yBias).
41-
ShadingMode shadingMode = ShadingMode::GOURAUD; ///< Shading model.
41+
ShadingMode shadingMode = ShadingMode::FLAT; ///< Shading model.
4242
char* name; ///< Optional name; used for `usemtl` matching in OBJ loading.
4343

4444
/// @brief Construct a material.

src/Object.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "Object.hpp"
2+
#include "Light.hpp"
23
#include "TrigLUT.hpp"
34
#include <cmath>
45

@@ -212,4 +213,68 @@ namespace Renderer
212213
{
213214
lookAt(targetObject->position);
214215
}
216+
void Object::bakeFlatLighting(const DirectionalLight* directionalLight,
217+
const AmbientLight* ambientLight)
218+
{
219+
// Ambient components (0 if no ambient light).
220+
const uint32_t ambR = ambientLight ? ambientLight->color.r : 0;
221+
const uint32_t ambG = ambientLight ? ambientLight->color.g : 0;
222+
const uint32_t ambB = ambientLight ? ambientLight->color.b : 0;
223+
224+
for (auto& tri : triangles) {
225+
tri.colorBaked = false;
226+
if (!tri.material) continue;
227+
if (tri.material->diffuseMap) continue; // textured — skip
228+
const ShadingMode mode = tri.material->shadingMode;
229+
if (mode != ShadingMode::FLAT &&
230+
mode != ShadingMode::GOURAUD &&
231+
mode != ShadingMode::UNLIT) continue;
232+
if (mode == ShadingMode::UNLIT ||
233+
(!directionalLight && !ambientLight)) {
234+
// Already emissive / no lights — just stamp the raw colour.
235+
tri.bakedColor = tri.material->color;
236+
tri.colorBaked = true;
237+
continue;
238+
}
239+
240+
// Compute brightness from the face normal (v1 for FLAT/GOURAUD).
241+
// Use the same squared-falloff Lambert as jetShadeBrightness.
242+
uint32_t brightness = 0;
243+
if (directionalLight) {
244+
const Vector3& N = vertices[tri.v1].normal;
245+
const Vector3& L = directionalLight->worldLightDir;
246+
int64_t dot = (int64_t)N.x * L.x + (int64_t)N.y * L.y + (int64_t)N.z * L.z;
247+
if (dot > 0) {
248+
uint32_t lambert = (uint32_t)(dot >> 12);
249+
if (lambert > 255) lambert = 255;
250+
lambert = (lambert * lambert + 128) >> 8; // squared falloff
251+
lambert = (lambert * (uint32_t)directionalLight->intensity) >> 8;
252+
brightness = (lambert * (uint32_t)tri.material->diffuse) >> 8;
253+
const uint32_t maxBrightness = 255u + tri.material->specular;
254+
if (brightness > maxBrightness) brightness = maxBrightness;
255+
}
256+
}
257+
258+
// Per-channel modulation (mirrors jetModulateRGB565 in Renderer.cpp).
259+
const uint16_t base = tri.material->color;
260+
const uint32_t maxBrightness = 255u + tri.material->specular;
261+
const uint32_t tR = std::min(brightness + ambR, maxBrightness);
262+
const uint32_t tG = std::min(brightness + ambG, maxBrightness);
263+
const uint32_t tB = std::min(brightness + ambB, maxBrightness);
264+
auto ch5 = [](uint32_t base5, uint32_t t) -> uint32_t {
265+
if (t > 255) { const uint32_t blow = t-255; return base5 + ((31u-base5)*blow)/256u; }
266+
const uint32_t v = base5 * t; return (v + 128u + (v>>8)) >> 8;
267+
};
268+
auto ch6 = [](uint32_t base6, uint32_t t) -> uint32_t {
269+
if (t > 255) { const uint32_t blow = t-255; return base6 + ((63u-base6)*blow)/256u; }
270+
const uint32_t v = base6 * t; return (v + 128u + (v>>8)) >> 8;
271+
};
272+
const uint32_t r = ch5((base >> 11) & 0x1Fu, tR);
273+
const uint32_t g = ch6((base >> 5) & 0x3Fu, tG);
274+
const uint32_t b = ch5( base & 0x1Fu, tB);
275+
tri.bakedColor = (uint16_t)((r << 11) | (g << 5) | b);
276+
tri.colorBaked = true;
277+
}
278+
}
279+
215280
} // namespace Renderer

src/Object.hpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
namespace Renderer {
1111

12+
class DirectionalLight;
13+
class AmbientLight;
14+
1215
/// @brief Triangle culling mode for an Object.
1316
enum class CullingMode {
1417
CULL_BACKFACES, ///< Cull triangles facing away from the camera.
@@ -33,6 +36,11 @@ class Object {
3336
struct Triangle {
3437
uint16_t v1, v2, v3; ///< Vertex indices.
3538
Material* material; ///< Material applied to this face.
39+
/// @brief Pre-lit RGB565 colour baked by bakeFlatLighting().
40+
/// When `colorBaked` is true this overrides material->color and
41+
/// bypasses all per-frame lighting computation for this face.
42+
uint16_t bakedColor = 0;
43+
bool colorBaked = false; ///< Cleared, then set by bakeFlatLighting() once `bakedColor` has been written for this face.
3644
};
3745

3846
/// @brief Single mesh vertex.
@@ -239,6 +247,23 @@ class Object {
239247
/// @param numerator Scale numerator.
240248
/// @param denominator Scale denominator (must be non-zero).
241249
void bakeScale(int32_t numerator, int32_t denominator);
250+
251+
/// @brief Pre-compute FLAT shading into each triangle's `bakedColor` field
252+
/// so that subsequent render() calls cost nothing for lighting.
253+
///
254+
/// Only untextured triangles whose material uses `ShadingMode::FLAT`,
255+
/// `GOURAUD` or `UNLIT` are processed; others are skipped.
256+
/// The original material pointers are preserved unchanged — baking does
257+
/// not allocate any new Material objects.
258+
///
259+
/// After calling this, set `LIGHTING=0` in JetConfig and the scene will
260+
/// render at full no-lighting speed while still looking correctly lit.
261+
/// If the light or ambient colour changes, call this again to rebuild.
262+
///
263+
/// @param directionalLight Directional light to bake from (may be nullptr).
264+
/// @param ambientLight Ambient light to bake from (may be nullptr).
265+
void bakeFlatLighting(const DirectionalLight* directionalLight,
266+
const AmbientLight* ambientLight);
242267
private:
243268

244269
};

src/ParticleSystem.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,11 +342,10 @@ class ParticleSystem {
342342

343343
// v0 = tip (velocity direction), v1/v2 = base ± perp.
344344
// noWriteZ so sparks don't occlude each other or later geometry.
345-
Object::Vertex v0, v1, v2;
345+
RenderVertex v0, v1, v2;
346346
v0.position = { tx, ty, tcZ };
347347
v1.position = { bx + perpX, by + perpY, camZ };
348348
v2.position = { bx - perpX, by - perpY, camZ };
349-
v0.color = v1.color = v2.color = mat.color;
350349

351350
raster->drawTriangle(v0, v1, v2, &mat,
352351
nullptr, nullptr,

src/Renderer.cpp

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,19 @@ static inline void fillRGB565Span(uint16_t* framebuffer, int32_t bufferIndex, in
106106
: : [v] "r"(color32)
107107
);
108108
const int stride = 16;
109-
for (int32_t i = 0; i < quads; ++i) {
109+
// 4x-unrolled: one loop iteration retires 64 bytes. The asm
110+
// blocks are volatile with a memory clobber, so GCC can't
111+
// unroll them itself — do it by hand to amortise the branch.
112+
for (int32_t i = quads >> 2; i > 0; --i) {
113+
__asm__ volatile (
114+
"ee.vst.128.xp q0, %[p], %[s]\n\t"
115+
"ee.vst.128.xp q0, %[p], %[s]\n\t"
116+
"ee.vst.128.xp q0, %[p], %[s]\n\t"
117+
"ee.vst.128.xp q0, %[p], %[s]\n\t"
118+
: [p] "+r"(out) : [s] "r"(stride) : "memory"
119+
);
120+
}
121+
for (int32_t i = quads & 3; i > 0; --i) {
110122
__asm__ volatile (
111123
"ee.vst.128.xp q0, %[p], %[s]\n\t"
112124
: [p] "+r"(out) : [s] "r"(stride) : "memory"
@@ -333,9 +345,9 @@ namespace Renderer
333345
}
334346

335347
bool PERF_CRITICAL Rasterizer::drawTriangle(
336-
const Object::Vertex &v1,
337-
const Object::Vertex &v2,
338-
const Object::Vertex &v3,
348+
const RenderVertex &v1,
349+
const RenderVertex &v2,
350+
const RenderVertex &v3,
339351
Material *material,
340352
DirectionalLight *directionalLight,
341353
AmbientLight *ambientLight,
@@ -344,7 +356,8 @@ namespace Renderer
344356
bool noWriteZBuffer,
345357
int zBias,
346358
uint8_t objAlpha,
347-
bool brightnessPrecomputed)
359+
bool brightnessPrecomputed,
360+
int32_t avgZHint)
348361
{
349362
// Fold per-object fade alpha into the material alpha up front so
350363
// every downstream alpha decision (early-out, depth fog, stipple
@@ -360,6 +373,8 @@ namespace Renderer
360373
return false;
361374
}
362375

376+
(void)avgZHint; // Only consumed on the FAST_Z && !LAZY_Z path.
377+
363378
#if TEXTURE_MAPPING
364379
Texture *diffuseMap = material->diffuseMap;
365380
#endif
@@ -413,14 +428,30 @@ namespace Renderer
413428

414429
#if FAST_Z
415430
#if LAZY_Z
431+
// LAZY_Z needs the max Z, not the average — the caller's avgZ hint
432+
// doesn't apply here.
416433
int32_t z = std::max({v1.position.z, v2.position.z, v3.position.z});
417-
#else
418-
int32_t z = (v1.position.z + v2.position.z + v3.position.z) / 3;
419-
#endif
420434
if (z < nearPlane || z > farPlane)
421435
{
422436
return false;
423437
}
438+
#else
439+
int32_t z;
440+
if (avgZHint != INT32_MIN)
441+
{
442+
// Scene::emitTri computed this exact average at queue time and
443+
// already culled against near/far — trust it and skip both.
444+
z = avgZHint;
445+
}
446+
else
447+
{
448+
z = (v1.position.z + v2.position.z + v3.position.z) / 3;
449+
if (z < nearPlane || z > farPlane)
450+
{
451+
return false;
452+
}
453+
}
454+
#endif
424455
#if Z_BUFFERING
425456
// Z buffer stores raw int32_t z, narrowed to uint16_t. The project's
426457
// farPlane (~4096) fits comfortably; we clamp to UINT16_MAX so any
@@ -494,7 +525,7 @@ namespace Renderer
494525
}
495526
else
496527
{
497-
const Object::Vertex* verts[3] = { &v1, &v2, &v3 };
528+
const RenderVertex* verts[3] = { &v1, &v2, &v3 };
498529
for (int i = 0; i < 3; i++)
499530
{
500531
vertexBrightness[i] = jetShadeBrightness(
@@ -720,13 +751,18 @@ namespace Renderer
720751
int32_t oneOverZ2 = (FIXED_POINT_SCALE * FIXED_POINT_SCALE) / v2.position.z;
721752
int32_t oneOverZ3 = (FIXED_POINT_SCALE * FIXED_POINT_SCALE) / v3.position.z;
722753

723-
// Precompute u_over_z and v_over_z at each vertex
754+
#if TEXTURE_MAPPING
755+
// Precompute u_over_z and v_over_z at each vertex. Only with
756+
// TEXTURE_MAPPING: RenderVertex carries uv only in textured builds.
757+
// (PERSPECTIVE_CORRECT_TEXTURES without TEXTURE_MAPPING is still a
758+
// valid config — it drives perspective-correct PHONG normals.)
724759
int32_t uOverZ1 = (v1.uv.x * oneOverZ1) / FIXED_POINT_SCALE;
725760
int32_t vOverZ1 = (v1.uv.y * oneOverZ1) / FIXED_POINT_SCALE;
726761
int32_t uOverZ2 = (v2.uv.x * oneOverZ2) / FIXED_POINT_SCALE;
727762
int32_t vOverZ2 = (v2.uv.y * oneOverZ2) / FIXED_POINT_SCALE;
728763
int32_t uOverZ3 = (v3.uv.x * oneOverZ3) / FIXED_POINT_SCALE;
729764
int32_t vOverZ3 = (v3.uv.y * oneOverZ3) / FIXED_POINT_SCALE;
765+
#endif // TEXTURE_MAPPING
730766

731767
#if LIGHTING
732768
// Precompute normal_component / z at each vertex so Phong shading
@@ -1033,7 +1069,16 @@ namespace Renderer
10331069
// across resolutions: 28 → 28 px on 800p desktop,
10341070
// 28 → ~8 px on 240p ESP32.
10351071
const int amp = (int)material->specular * screenHeight / 2400; // /3 of original
1036-
const int angle = ((y * 5 + (int)(waterTime * 240.0f)) % 360 + 360) % 360;
1072+
// Perspective-correct wave density: waves should appear compressed
1073+
// (denser) near the horizon where geometry is far away, and stretched
1074+
// (sparser) near the camera. (screenHeight - y) is large at the top of
1075+
// the screen (horizon side) and approaches 0 at the very bottom (camera
1076+
// side), so squaring it produces the right quadratic phase accumulation —
1077+
// many wave cycles near the waterline, fewer toward the viewer.
1078+
// Cost: one extra multiply and a divide versus the old linear y*5.
1079+
const int perspY = screenHeight - y;
1080+
const int angle = ((perspY * perspY * 20 / screenHeight + (int)(waterTime * 240.0f)) % 360 + 360) % 360;
1081+
//const int angle = ((y * 5 + (int)(waterTime * 240.0f)) % 360 + 360) % 360; //MB: This version doesn't take perspective into account, so waves look the same near and far, which is less realistic but more performant.
10371082
const int ripple = (lookupSinI(angle) * amp) >> 10; // Q10 → pixels
10381083
// Use the camera-pitch-correct waterline as the mirror axis:
10391084
// mirrorY = 2*waterlineY - y gives geometrically accurate

src/Renderer.hpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,32 @@
2424

2525
namespace Renderer
2626
{
27+
/// @brief Slim render-time vertex flowing through the transform → queue →
28+
/// rasterise pipeline.
29+
///
30+
/// `Object::Vertex` is the authoring type: game/mesh code always has uv,
31+
/// normal, etc. available regardless of build configuration. RenderVertex
32+
/// is what the per-frame pipeline copies around (Scene's transformed-vertex
33+
/// scratch, the render queue, the painter's sort) — so it only carries the
34+
/// fields the configured pipeline actually consumes. With TEXTURE_MAPPING
35+
/// and LIGHTING both off this is just 12 bytes of position instead of 36,
36+
/// which cuts the per-triangle queue/sort copy traffic to roughly a third.
37+
///
38+
/// `position` is screen-space x/y with camera-space z after projection.
39+
struct RenderVertex {
40+
Vector3 position = {0, 0, 0}; ///< Screen-space x/y, camera-space z; always present.
41+
#if TEXTURE_MAPPING
42+
Vector2 uv = {0, 0}; ///< Texture coordinates.
43+
#endif
44+
#if LIGHTING
45+
Vector3 normal = {0, 0, 0}; ///< View-space (or mesh-local, see lambertBrightness) normal.
46+
/// @brief Precomputed Lambert brightness for the object-local-light
47+
/// path (see Scene.cpp "objectLocalLight"). Only meaningful when the
48+
/// triangle was queued with brightnessPrecomputed == true.
49+
uint16_t lambertBrightness = 0;
50+
#endif
51+
};
52+
2753
/// @brief Low-level triangle rasteriser owning a colour and depth buffer.
2854
///
2955
/// Scene drives this class on every render(). External users normally
@@ -145,8 +171,15 @@ class Rasterizer
145171
/// @param noWriteZBuffer Skip the depth write when true.
146172
/// @param zBias Per-triangle depth bias in z-buffer units.
147173
/// @param objAlpha Per-object alpha multiplier (255 = no fade).
174+
/// @param brightnessPrecomputed v1/v2/v3.lambertBrightness already holds per-vertex brightness (object-local-light path).
175+
/// @param avgZHint Caller-supplied triangle average camera-space Z, or
176+
/// INT32_MIN (default) to compute it here. Scene::rasterizeBand
177+
/// passes the avgZ it already computed (and near/far-culled
178+
/// against) at queue time, so the FAST_Z setup can skip the
179+
/// recompute and the redundant near/far test. Ignored when
180+
/// LAZY_Z is enabled (LAZY_Z needs the max, not the average).
148181
/// @return True if the triangle produced any rasterizer work.
149-
bool drawTriangle(const Object::Vertex &v1, const Object::Vertex &v2, const Object::Vertex &v3, Material *material, DirectionalLight *directionalLight, AmbientLight *ambientLight, bool renderEvenLines, bool ignoreZBuffer, bool noWriteZBuffer, int zBias, uint8_t objAlpha = 255, bool brightnessPrecomputed = false);
182+
bool drawTriangle(const RenderVertex &v1, const RenderVertex &v2, const RenderVertex &v3, Material *material, DirectionalLight *directionalLight, AmbientLight *ambientLight, bool renderEvenLines, bool ignoreZBuffer, bool noWriteZBuffer, int zBias, uint8_t objAlpha = 255, bool brightnessPrecomputed = false, int32_t avgZHint = INT32_MIN);
150183

151184
/// @brief Map an 8-bit grayscale value to RGB565.
152185
/// @param grayscale 8-bit luminance.

0 commit comments

Comments
 (0)