@@ -106,7 +106,19 @@ static inline void fillRGB565Span(uint16_t* framebuffer, int32_t bufferIndex, in
106106 : : [v] " r" (color32)
107107 );
108108 const int stride = 16 ;
109- for (int32_t i = 0 ; i < quads; ++i) {
109+ // 4x-unrolled: one loop iteration retires 64 bytes. The asm
110+ // blocks are volatile with a memory clobber, so GCC can't
111+ // unroll them itself — do it by hand to amortise the branch.
112+ for (int32_t i = quads >> 2 ; i > 0 ; --i) {
113+ __asm__ volatile (
114+ " ee.vst.128.xp q0, %[p], %[s]\n\t "
115+ " ee.vst.128.xp q0, %[p], %[s]\n\t "
116+ " ee.vst.128.xp q0, %[p], %[s]\n\t "
117+ " ee.vst.128.xp q0, %[p], %[s]\n\t "
118+ : [p] " +r" (out) : [s] " r" (stride) : " memory"
119+ );
120+ }
121+ for (int32_t i = quads & 3 ; i > 0 ; --i) {
110122 __asm__ volatile (
111123 " ee.vst.128.xp q0, %[p], %[s]\n\t "
112124 : [p] " +r" (out) : [s] " r" (stride) : " memory"
@@ -333,9 +345,9 @@ namespace Renderer
333345 }
334346
335347 bool PERF_CRITICAL Rasterizer::drawTriangle (
336- const Object::Vertex &v1,
337- const Object::Vertex &v2,
338- const Object::Vertex &v3,
348+ const RenderVertex &v1,
349+ const RenderVertex &v2,
350+ const RenderVertex &v3,
339351 Material *material,
340352 DirectionalLight *directionalLight,
341353 AmbientLight *ambientLight,
@@ -344,7 +356,8 @@ namespace Renderer
344356 bool noWriteZBuffer,
345357 int zBias,
346358 uint8_t objAlpha,
347- bool brightnessPrecomputed)
359+ bool brightnessPrecomputed,
360+ int32_t avgZHint)
348361 {
349362 // Fold per-object fade alpha into the material alpha up front so
350363 // every downstream alpha decision (early-out, depth fog, stipple
@@ -360,6 +373,8 @@ namespace Renderer
360373 return false ;
361374 }
362375
376+ (void )avgZHint; // Only consumed on the FAST_Z && !LAZY_Z path.
377+
363378#if TEXTURE_MAPPING
364379 Texture *diffuseMap = material->diffuseMap ;
365380#endif
@@ -413,14 +428,30 @@ namespace Renderer
413428
414429#if FAST_Z
415430#if LAZY_Z
431+ // LAZY_Z needs the max Z, not the average — the caller's avgZ hint
432+ // doesn't apply here.
416433 int32_t z = std::max ({v1.position .z , v2.position .z , v3.position .z });
417- #else
418- int32_t z = (v1.position .z + v2.position .z + v3.position .z ) / 3 ;
419- #endif
420434 if (z < nearPlane || z > farPlane)
421435 {
422436 return false ;
423437 }
438+ #else
439+ int32_t z;
440+ if (avgZHint != INT32_MIN )
441+ {
442+ // Scene::emitTri computed this exact average at queue time and
443+ // already culled against near/far — trust it and skip both.
444+ z = avgZHint;
445+ }
446+ else
447+ {
448+ z = (v1.position .z + v2.position .z + v3.position .z ) / 3 ;
449+ if (z < nearPlane || z > farPlane)
450+ {
451+ return false ;
452+ }
453+ }
454+ #endif
424455 #if Z_BUFFERING
425456 // Z buffer stores raw int32_t z, narrowed to uint16_t. The project's
426457 // farPlane (~4096) fits comfortably; we clamp to UINT16_MAX so any
@@ -494,7 +525,7 @@ namespace Renderer
494525 }
495526 else
496527 {
497- const Object::Vertex * verts[3 ] = { &v1, &v2, &v3 };
528+ const RenderVertex * verts[3 ] = { &v1, &v2, &v3 };
498529 for (int i = 0 ; i < 3 ; i++)
499530 {
500531 vertexBrightness[i] = jetShadeBrightness (
@@ -720,13 +751,18 @@ namespace Renderer
720751 int32_t oneOverZ2 = (FIXED_POINT_SCALE * FIXED_POINT_SCALE ) / v2.position .z ;
721752 int32_t oneOverZ3 = (FIXED_POINT_SCALE * FIXED_POINT_SCALE ) / v3.position .z ;
722753
723- // Precompute u_over_z and v_over_z at each vertex
754+ #if TEXTURE_MAPPING
755+ // Precompute u_over_z and v_over_z at each vertex. Only with
756+ // TEXTURE_MAPPING: RenderVertex carries uv only in textured builds.
757+ // (PERSPECTIVE_CORRECT_TEXTURES without TEXTURE_MAPPING is still a
758+ // valid config — it drives perspective-correct PHONG normals.)
724759 int32_t uOverZ1 = (v1.uv .x * oneOverZ1) / FIXED_POINT_SCALE ;
725760 int32_t vOverZ1 = (v1.uv .y * oneOverZ1) / FIXED_POINT_SCALE ;
726761 int32_t uOverZ2 = (v2.uv .x * oneOverZ2) / FIXED_POINT_SCALE ;
727762 int32_t vOverZ2 = (v2.uv .y * oneOverZ2) / FIXED_POINT_SCALE ;
728763 int32_t uOverZ3 = (v3.uv .x * oneOverZ3) / FIXED_POINT_SCALE ;
729764 int32_t vOverZ3 = (v3.uv .y * oneOverZ3) / FIXED_POINT_SCALE ;
765+ #endif // TEXTURE_MAPPING
730766
731767#if LIGHTING
732768 // Precompute normal_component / z at each vertex so Phong shading
@@ -1033,7 +1069,16 @@ namespace Renderer
10331069 // across resolutions: 28 → 28 px on 800p desktop,
10341070 // 28 → ~8 px on 240p ESP32.
10351071 const int amp = (int )material->specular * screenHeight / 2400 ; // /3 of original
1036- const int angle = ((y * 5 + (int )(waterTime * 240 .0f )) % 360 + 360 ) % 360 ;
1072+ // Perspective-correct wave density: waves should appear compressed
1073+ // (denser) near the horizon where geometry is far away, and stretched
1074+ // (sparser) near the camera. (screenHeight - y) is large at the top of
1075+ // the screen (horizon side) and approaches 0 at the very bottom (camera
1076+ // side), so squaring it produces the right quadratic phase accumulation —
1077+ // many wave cycles near the waterline, fewer toward the viewer.
1078+ // Cost: one extra multiply and a divide versus the old linear y*5.
1079+ const int perspY = screenHeight - y;
1080+ const int angle = ((perspY * perspY * 20 / screenHeight + (int )(waterTime * 240 .0f )) % 360 + 360 ) % 360 ;
1081+ // const int angle = ((y * 5 + (int)(waterTime * 240.0f)) % 360 + 360) % 360; //MB: This version doesn't take perspective into account, so waves look the same near and far, which is less realistic but more performant.
10371082 const int ripple = (lookupSinI (angle) * amp) >> 10 ; // Q10 → pixels
10381083 // Use the camera-pitch-correct waterline as the mirror axis:
10391084 // mirrorY = 2*waterlineY - y gives geometrically accurate
0 commit comments