q_shared: better magic constant for the Q_rsqrt trick

illwieckz · illwieckz · commit 95fa8f85bc80 · 2024-12-10T13:16:30.000+01:00
diff --git a/src/engine/qcommon/q_shared.h b/src/engine/qcommon/q_shared.h
@@ -338,29 +338,6 @@ extern const quat_t   quatIdentity;
 
 #define Q_ftol(x) ((long)(x))
 
-	// Overall relative error bound (ignoring unknown powerpc case): 5 * 10^-6
-	// https://en.wikipedia.org/wiki/Fast_inverse_square_root#/media/File:2nd-iter.png
-	inline float Q_rsqrt( float number )
-	{
-		// compute approximate inverse square root
-#if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse)
-		float y;
-		// SSE rsqrt relative error bound: 3.7 * 10^-4
-		_mm_store_ss( &y, _mm_rsqrt_ss( _mm_load_ss( &number ) ) );
-#else
-		float x = 0.5f * number;
-		float y = Util::bit_cast<float>( 0x5f3759df - ( Util::bit_cast<uint32_t>( number ) >> 1 ) );
-		// initial iteration
-		// relative error bound after the initial iteration: 1.8 * 10^-3
-		y *= ( 1.5f - ( x * y * y ) );
-#if 0
-		// second iteration for higher precision
-		y *= ( 1.5f - ( x * y * y ) );
-#endif
-#endif
-		return y;
-	}
-
 inline float Q_fabs( float x )
 {
 	return fabsf( x );
@@ -498,6 +475,99 @@ void SnapVector( V &&v )
 	v[ 2 ] = roundf( v[ 2 ] );
 }
 
+/* The original Q_rsqrt algorithm is:
+
+typedef union { float f; uint32_t u; } uf_t;
+float Q_rsqrt( float n )
+{
+	uint32_t magic = 0x5f3759dful;
+	float a = 0.5f;
+	float b = 3.0f;
+	uf_t o; o.f = n;
+	o.u = magic - ( o.u >> 1 );
+	return a * o.f * ( b - n * o.f * o.f );
+}
+
+It could be written like this, this is what Quake 3 did:
+
+float Q_rsqrt( float n )
+{
+	uint32_t magic = 0x5f3759dful;
+	float a = 0.5f;
+	float b = 3.0f;
+	float c = a * b; // 1.5f
+	uf_t o = n;
+	o.u = magic - ( o.u >> 1);
+	float x = n * a;
+	return o.f * a ( c - ( x * o.f * o.f ) );
+}
+
+It was written with a second iteration commented out:
+
+float Q_rsqrt( float n )
+{
+	uint32_t magic = 0x5f3759dful;
+	float a = 0.5f;
+	float b = 3.0f;
+	float c = a * b; // 1.5f
+	uf_t o; o.f = n;
+	o.u = magic - ( o.u >> 1);
+	float x = n * a;
+	o.f *= a * ( c - ( x * o.f * o.f ) );
+//	o.f *= a * ( c - ( x * o.f * o.f ) );
+	return o.f;
+}
+
+The relative error bound after the initial iteration was: 1.8×10⁻³
+The relative error bound after a second iteration was: 5×10⁻⁶
+
+Better values are usable from: http://rrrola.wz.cz/inv_sqrt.html
+
+float Q_rsqrt( float n )
+{
+	uint32_t magic = 0x5f1ffff9ul:
+	float a = 0.703952253f;
+	float b = 2.38924456f;
+	uf_t o; o.f = n;
+	o.u = magic - ( o.u >> 1 );
+	return a * o.f * ( b - n * y.f * y.f );
+}
+
+The relative error bound after the initial iteration is: 2.00010826×10⁻⁷ */
+
+#define Q_RSQRT_QUAKE3_CONSTANTS 0
+#define Q_RSQRT_DOUBLE_ITERATION 0
+
+#if Q_RSQRT_QUAKE3_CONSTANTS
+// Constants used in Quake 3.
+static uint32_t qrsqrt_magic = 0x5f3759dful;
+static float qrsqrt_a = 0.5f;
+static float qrsqrt_b = 3.0f;
+#else
+// Constants computed by Řrřola.
+static uint32_t qrsqrt_magic = 0x5f1ffff9ul;
+static float qrsqrt_a = 0.703974056f;
+static float qrsqrt_b = 2.38919526f;
+#endif
+
+inline float Q_rsqrt( float n )
+{
+	// Compute approximate inverse square root.
+#if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse)
+	float o;
+	// SSE rsqrt relative error bound: 3.7 * 10^-4
+	_mm_store_ss( &o, _mm_rsqrt_ss( _mm_load_ss( &n ) ) );
+#else
+	float o = Util::bit_cast<float>( qrsqrt_magic - ( Util::bit_cast<uint32_t>( n ) >> 1 ) );
+	o *= qrsqrt_a * ( qrsqrt_b - n * o * o );
+#if Q_RSQRT_DOUBLE_ITERATION
+	// Second iteration for higher precision.
+	o *= qsqrt_a * ( qsqrt_b - n * o * o );
+#endif
+#endif
+	return o;
+}
+
 #define VectorLerpTrem( f, s, e, r ) (( r )[ 0 ] = ( s )[ 0 ] + ( f ) * (( e )[ 0 ] - ( s )[ 0 ] ), \
                                       ( r )[ 1 ] = ( s )[ 1 ] + ( f ) * (( e )[ 1 ] - ( s )[ 1 ] ), \
                                       ( r )[ 2 ] = ( s )[ 2 ] + ( f ) * (( e )[ 2 ] - ( s )[ 2 ] ))