diff --git a/src/engine/qcommon/q_math_test.cpp b/src/engine/qcommon/q_math_test.cpp index 1bb6a85ffa..2804b7ad66 100644 --- a/src/engine/qcommon/q_math_test.cpp +++ b/src/engine/qcommon/q_math_test.cpp @@ -219,4 +219,18 @@ TEST(QSharedMathTest, InverseSquareRoot) EXPECT_THAT(Q_rsqrt(1e6), RsqrtEq(1e-3)); } +TEST(QSharedMathTest, FastInverseSquareRoot) +{ + constexpr float relativeTolerance = 6.50196699e-4; + auto RsqrtEq = [=](float expected) { return FloatNear(expected, expected * relativeTolerance); }; + + EXPECT_THAT(Q_rsqrt_fast(1e-6), RsqrtEq(1e3)); + EXPECT_THAT(Q_rsqrt_fast(0.036), RsqrtEq(5.270463)); + EXPECT_THAT(Q_rsqrt_fast(0.2), RsqrtEq(2.236068)); + EXPECT_THAT(Q_rsqrt_fast(1), RsqrtEq(1)); + EXPECT_THAT(Q_rsqrt_fast(3), RsqrtEq(0.5773503)); + EXPECT_THAT(Q_rsqrt_fast(29.1), RsqrtEq(0.1853760)); + EXPECT_THAT(Q_rsqrt_fast(1e6), RsqrtEq(1e-3)); +} + } // namespace diff --git a/src/engine/qcommon/q_shared.h b/src/engine/qcommon/q_shared.h index f014947496..e5e6a2b1f4 100644 --- a/src/engine/qcommon/q_shared.h +++ b/src/engine/qcommon/q_shared.h @@ -338,25 +338,101 @@ extern const quat_t quatIdentity; #define Q_ftol(x) ((long)(x)) - // Overall relative error bound (ignoring unknown powerpc case): 5 * 10^-6 - // https://en.wikipedia.org/wiki/Fast_inverse_square_root#/media/File:2nd-iter.png - inline float Q_rsqrt( float number ) - { - float x = 0.5f * number; - float y; +/* The original Q_rsqrt algorithm is: + +float Q_rsqrt( float n ) +{ + uint32_t magic = 0x5f3759dful; + float a = 0.5f; + float b = 3.0f; + union { float f; uint32_t u; } o = {n}; + o.u = magic - ( o.u >> 1 ); + return a * o.f * ( b - n * o.f * o.f ); +} + +It could be written like this, this is what Quake 3 did: + +float Q_rsqrt( float n ) +{ + uint32_t magic = 0x5f3759dful; + float a = 0.5f; + float b = 3.0f; + float c = a * b; // 1.5f + union { float f; uint32_t u; } o = {n}; + o.u = magic - ( o.u >> 1); + float x = n * a; + return o.f * ( c - ( x * o.f * o.f ) ); + o.f *= c - ( x * o.f * o.f ); +// o.f *= c - ( x * o.f * o.f ); + return o.f; +} + +It was written with a second iteration commented out. + +The relative error bound after the initial iteration was: 1.8×10⁻³ +The relative error bound after a second iteration was: 5×10⁻⁶ + +The 0x5f3759df magic constant comes from the Quake 3 source code: +https://github.com/id-Software/Quake-III-Arena/blob/dbe4ddb/code/game/q_math.c#L56 + +That magic constant was good enough but better ones can be used. - // compute approximate inverse square root +Chris lomont computed a better magic constant of 0x5f375a86 while +keeping the other values of 0.5 and 3.0 for all iterations: +https://www.lomont.org/papers/2003/InvSqrt.pdf + +Jan Kadlec computed an ever better magic constant but it requires +different values for the first iteration: http://rrrola.wz.cz/inv_sqrt.html + +float Q_rsqrt( float n ) +{ + uint32_t magic = 0x5f1ffff9ul: + float a = 0.703952253f; + float b = 2.38924456f; + union { float f; uint32_t u; } o = {n}; + o.u = magic - ( o.u >> 1 ); + return a * o.f * ( b - n * y.f * y.f ); +} + +The relative error bound is: 6.50196699×10⁻⁴ */ + +// Compute approximate inverse square root. +inline float Q_rsqrt_fast( const float n ) +{ #if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse) - // SSE rsqrt relative error bound: 3.7 * 10^-4 - _mm_store_ss( &y, _mm_rsqrt_ss( _mm_load_ss( &number ) ) ); + float o; + // The SSE rsqrt relative error bound is 3.7×10⁻⁴. + _mm_store_ss( &o, _mm_rsqrt_ss( _mm_load_ss( &n ) ) ); #else - y = Util::bit_cast( 0x5f3759df - ( Util::bit_cast( number ) >> 1 ) ); - y *= ( 1.5f - ( x * y * y ) ); // initial iteration - // relative error bound after the initial iteration: 1.8 * 10^-3 + /* Magic constants by Jan Kadlec, with a relative error bound + of 6.50196699×10⁻⁴. + See: http://rrrola.wz.cz/inv_sqrt.html */ + constexpr float a = 0.703952253f; + constexpr float b = 2.38924456f; + constexpr uint32_t magic = 0x5f1ffff9ul; + float o = Util::bit_cast( magic - ( Util::bit_cast( n ) >> 1 ) ); + o *= a * ( b - n * o * o ); #endif - y *= ( 1.5f - ( x * y * y ) ); // second iteration for higher precision - return y; - } + return o; +} + +inline float Q_rsqrt( const float n ) +{ + /* When using the magic constants, the relative error bound after the + iteration is expected to be at most 5×10⁻⁶. It was achieved with the + less-good Quake 3 constants with a first iteration having originally + a relative error bound of 1.8×10⁻³. + Since the new magic constants provide a better relative error bound of + 6.50196699×10⁻⁴, the relative error bound is now expected to be smaller. + When using the SSE rsqrt, the initial error bound is 3.7×10⁻⁴ so after + the iteration it is also expected to be smaller. */ + constexpr float a = 0.5f; + constexpr float b = 3.0f; + float o = Q_rsqrt_fast( n ); + // Do an iteration of Newton's method for finding the zero of: f(x) = 1÷x² - n + o *= a * ( b - n * o * o ); + return o; +} inline float Q_fabs( float x ) { @@ -617,7 +693,7 @@ inline vec_t VectorNormalize( vec3_t v ) // that length != 0, nor does it return length inline void VectorNormalizeFast( vec3_t v ) { - vec_t ilength = Q_rsqrt( DotProduct( v, v ) ); + vec_t ilength = Q_rsqrt_fast( DotProduct( v, v ) ); VectorScale( v, ilength, v ); } diff --git a/src/engine/renderer/tr_main.cpp b/src/engine/renderer/tr_main.cpp index a70bf36a9b..1c9e785dbd 100644 --- a/src/engine/renderer/tr_main.cpp +++ b/src/engine/renderer/tr_main.cpp @@ -246,7 +246,7 @@ void R_TBNtoQtangents( const vec3_t tangent, const vec3_t binormal, if ( ( trace = tangent2[ 0 ] + binormal2[ 1 ] + normal2[ 2 ] ) > 0.0f ) { trace += 1.0f; - scale = 0.5f * Q_rsqrt( trace ); + scale = 0.5f * Q_rsqrt_fast( trace ); q[ 3 ] = trace * scale; q[ 2 ] = ( tangent2 [ 1 ] - binormal2[ 0 ] ) * scale; @@ -257,7 +257,7 @@ void R_TBNtoQtangents( const vec3_t tangent, const vec3_t binormal, else if ( tangent2[ 0 ] > binormal2[ 1 ] && tangent2[ 0 ] > normal2[ 2 ] ) { trace = tangent2[ 0 ] - binormal2[ 1 ] - normal2[ 2 ] + 1.0f; - scale = 0.5f * Q_rsqrt( trace ); + scale = 0.5f * Q_rsqrt_fast( trace ); q[ 0 ] = trace * scale; q[ 1 ] = ( tangent2 [ 1 ] + binormal2[ 0 ] ) * scale; @@ -268,7 +268,7 @@ void R_TBNtoQtangents( const vec3_t tangent, const vec3_t binormal, else if ( binormal2[ 1 ] > normal2[ 2 ] ) { trace = -tangent2[ 0 ] + binormal2[ 1 ] - normal2[ 2 ] + 1.0f; - scale = 0.5f * Q_rsqrt( trace ); + scale = 0.5f * Q_rsqrt_fast( trace ); q[ 1 ] = trace * scale; q[ 0 ] = ( tangent2 [ 1 ] + binormal2[ 0 ] ) * scale; @@ -279,7 +279,7 @@ void R_TBNtoQtangents( const vec3_t tangent, const vec3_t binormal, else { trace = -tangent2[ 0 ] - binormal2[ 1 ] + normal2[ 2 ] + 1.0f; - scale = 0.5f * Q_rsqrt( trace ); + scale = 0.5f * Q_rsqrt_fast( trace ); q[ 2 ] = trace * scale; q[ 3 ] = ( tangent2 [ 1 ] - binormal2[ 0 ] ) * scale;