diff --git a/doc/source/stdlib/handmade/function-builtin-ref_time_ticks-0xc4aeadf4dce1cb3f.rst b/doc/source/stdlib/handmade/function-builtin-ref_time_ticks-0xc4aeadf4dce1cb3f.rst index 37041594a..2cb58c312 100644 --- a/doc/source/stdlib/handmade/function-builtin-ref_time_ticks-0xc4aeadf4dce1cb3f.rst +++ b/doc/source/stdlib/handmade/function-builtin-ref_time_ticks-0xc4aeadf4dce1cb3f.rst @@ -1 +1 @@ -Captures the current high-resolution time in ticks, suitable for measuring elapsed intervals with `get_time_usec`. +Returns a monotonic timestamp in nanoseconds since an unspecified epoch (the same epoch within a process; not comparable across processes or reboots). Use with `get_time_usec(ref)` / `get_time_nsec(ref)` for elapsed-interval math; raw subtraction `now - then` is also valid since the unit is always nanoseconds on every platform. diff --git a/src/hal/performance_time.cpp b/src/hal/performance_time.cpp index 6d02f94ce..35541c504 100644 --- a/src/hal/performance_time.cpp +++ b/src/hal/performance_time.cpp @@ -15,36 +15,71 @@ namespace das { #endif +// ref_time_ticks() returns CLOCK_MONOTONIC-style nanoseconds on every platform. +// Prior to this normalization, Windows returned raw QueryPerformanceCounter +// ticks (~10 MHz typical) while Linux/macOS already returned ns — so callers +// that did `ref_time_ticks() + int64(timeout_sec * 1_000_000)` for a 30 s +// timeout got 3 s on Windows (30e6 ticks at 10 MHz) but 30 ms on POSIX. The footgun is +// gone: raw subtraction `now - then` always yields nanoseconds elapsed. +// +// Prefer `get_time_usec(start)` / `get_time_nsec(start)` for elapsed-time +// comparisons — those wrap the subtraction and continue to work portably. + #ifdef _MSC_VER #define WIN32_LEAN_AND_MEAN #include +// QueryPerformanceFrequency is invariant after boot — cache once per process +// to avoid a syscall on every ref_time_ticks() call. Race-tolerant: parallel +// initialisers all compute the same value, and int64 stores are atomic on +// x64/arm64. 
+// +// We also precompute `qpc_ns_per_tick = 1e9 / freq` when it divides cleanly +// (the usual Windows 10/11 case where QPF = 10 MHz → 100 ns/tick). The fast path +// is one multiply per call, so ref_time_ticks() stays within ~1 ns of the +// bare QueryPerformanceCounter cost — critical for the function profiler, +// which brackets every call. Fallback split path handles non-divisible +// frequencies (not seen on current Windows; older releases can report them). +static int64_t qpc_ns_per_tick = 0; // 0 -> use the fallback split path + +static int64_t qpc_freq() { + static int64_t cached = 0; + if ( cached == 0 ) { + LARGE_INTEGER f; + QueryPerformanceFrequency(&f); + cached = f.QuadPart; + qpc_ns_per_tick = (1000000000LL % cached == 0) ? (1000000000LL / cached) : 0; + } + return cached; +} + extern "C" int64_t ref_time_ticks () { - LARGE_INTEGER t0; + LARGE_INTEGER t0; QueryPerformanceCounter(&t0); - return t0.QuadPart; + const int64_t freq = qpc_freq(); + if ( qpc_ns_per_tick ) { + return t0.QuadPart * qpc_ns_per_tick; + } + // Fallback: convert QPC counter to nanoseconds without overflowing int64: + // ns = (ticks / freq) * 1e9 + (ticks % freq) * 1e9 / freq + // (ticks / freq) is whole seconds, so it fits comfortably; the remainder is + // < freq, so remainder * 1e9 < 2^63 for any freq below ~9.2 GHz. 
+ const int64_t whole = t0.QuadPart / freq; + const int64_t rem = t0.QuadPart % freq; + return whole * 1000000000LL + (rem * 1000000000LL) / freq; } extern "C" int get_time_usec ( int64_t reft ) { - int64_t t0 = ref_time_ticks(); - LARGE_INTEGER freq; - QueryPerformanceFrequency(&freq); - return int((t0-reft)*1000000LL/freq.QuadPart); + return int((ref_time_ticks() - reft) / 1000LL); } extern "C" int64_t get_time_nsec ( int64_t reft ) { - int64_t t0 = ref_time_ticks(); - LARGE_INTEGER freq; - QueryPerformanceFrequency(&freq); - return int64_t((t0-reft)*1000000000LL/freq.QuadPart); + return ref_time_ticks() - reft; } -extern "C" int64_t ref_time_delta_to_usec(int64_t ref) -{ - LARGE_INTEGER freq; - QueryPerformanceCounter(&freq); - return ref * 1000000LL/freq.QuadPart; +extern "C" int64_t ref_time_delta_to_usec ( int64_t ref ) { + return ref / 1000LL; } #elif __linux__ || defined(_EMSCRIPTEN_VER) || defined __HAIKU__ @@ -59,37 +94,36 @@ extern "C" int64_t ref_time_ticks () { DAS_ASSERT(false); return -1; } - return ts.tv_sec * NSEC_IN_SEC + ts.tv_nsec; } extern "C" int get_time_usec ( int64_t reft ) { - return int((ref_time_ticks() - reft) / (NSEC_IN_SEC/1000000LL)); + return int((ref_time_ticks() - reft) / 1000LL); } extern "C" int64_t get_time_nsec ( int64_t reft ) { - return ref_time_ticks() - reft; + return ref_time_ticks() - reft; } -extern "C" int64_t ref_time_delta_to_usec(int64_t ref) { return ref / (NSEC_IN_SEC/1000000LL); } +extern "C" int64_t ref_time_delta_to_usec ( int64_t ref ) { return ref / 1000LL; } #else // osx #include -extern "C" int64_t ref_time_ticks() { +extern "C" int64_t ref_time_ticks () { return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); } extern "C" int get_time_usec ( int64_t reft ) { - return (clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW) - reft)/1000LL; + return int((ref_time_ticks() - reft) / 1000LL); } extern "C" int64_t get_time_nsec ( int64_t reft ) { - return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW) - reft; + return 
ref_time_ticks() - reft; } -extern "C" int64_t ref_time_delta_to_usec(int64_t ref) { return ref / 1000LL; } +extern "C" int64_t ref_time_delta_to_usec ( int64_t ref ) { return ref / 1000LL; } #endif