From fac2bddc8dc89c61a2f79ec533b21c1c39f8236b Mon Sep 17 00:00:00 2001
From: Kraionix
Date: Fri, 21 Feb 2025 21:34:27 +0700
Subject: [PATCH] Merged feature/simd-memchr as a single commit.

---
 imgui.cpp         | 121 ++++++++++++++++++++++++++++++++++++++++++++--
 imgui_draw.cpp    |   4 +-
 imgui_internal.h  |  35 ++++++++++++---
 imgui_widgets.cpp |  10 ++---
 4 files changed, 153 insertions(+), 17 deletions(-)

diff --git a/imgui.cpp b/imgui.cpp
index 4ba2051d5532..d39a5358a959 100644
--- a/imgui.cpp
+++ b/imgui.cpp
@@ -1943,6 +1943,119 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c,
 // [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions)
 //-----------------------------------------------------------------------------
 
+// Find the first occurrence of (unsigned char)val in the first 'count' bytes of 'buf'. Same contract as memchr():
+// returns a pointer to the matching byte, or NULL when there is none. No byte outside [buf, buf + count) is read.
+#if defined(IMGUI_ENABLE_AVX2_IMMEMCHR) || defined(IMGUI_ENABLE_SSE_IMMEMCHR)
+// Index of the lowest set bit of a non-zero mask. Stands in for _tzcnt_u32(), which requires BMI1 on top of SSE/AVX2.
+static inline unsigned int ImMemchrLowestSetBit(unsigned int mask)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    unsigned long index = 0;
+    _BitScanForward(&index, mask);
+    return (unsigned int)index;
+#else
+    return (unsigned int)__builtin_ctz(mask);
+#endif
+}
+#endif
+
+#if defined IMGUI_ENABLE_AVX2_IMMEMCHR
+const void* ImMemchr(const void* buf, int val, size_t count)
+{
+    const size_t SIMD_LENGTH = 32;
+    const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
+    const size_t PREFETCH_DISTANCE = 1024;
+
+    const unsigned char* ptr = (const unsigned char*)buf;
+    const unsigned char* end = ptr + count;
+    const unsigned char ch = (unsigned char)val;
+
+    // Compare sizes instead of forming 'end - SIMD_LENGTH': for short buffers that pointer is out of bounds, and
+    // for buf == NULL with count == 0 it would wrap around and let the vector loop read from address 0.
+    if (count >= SIMD_LENGTH)
+    {
+        const __m256i target = _mm256_set1_epi8((char)ch);
+
+        // Unaligned head: test the first vector, then step to the next aligned address (the overlap is re-tested).
+        if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
+        {
+            const __m256i chunk = _mm256_loadu_si256((const __m256i*)ptr);
+            const unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
+            if (mask)
+                return ptr + ImMemchrLowestSetBit(mask);
+            ptr += SIMD_LENGTH - ((uintptr_t)ptr & SIMD_LENGTH_MASK);
+        }
+
+        // Aligned body: one full vector per iteration while at least SIMD_LENGTH bytes remain.
+        for (; (size_t)(end - ptr) >= SIMD_LENGTH; ptr += SIMD_LENGTH)
+        {
+            const __m256i chunk = _mm256_load_si256((const __m256i*)ptr);
+            const unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
+            if (mask)
+                return ptr + ImMemchrLowestSetBit(mask);
+            if ((size_t)(end - ptr) >= PREFETCH_DISTANCE)
+                _mm_prefetch((const char*)(ptr + PREFETCH_DISTANCE), _MM_HINT_T0);
+        }
+    }
+
+    // Scalar tail, which is also the whole search when the buffer is shorter than one vector.
+    for (; ptr < end; ptr++)
+        if (*ptr == ch)
+            return ptr;
+    return nullptr;
+}
+#elif defined IMGUI_ENABLE_SSE_IMMEMCHR
+const void* ImMemchr(const void* buf, int val, size_t count)
+{
+    const size_t SIMD_LENGTH = 16;
+    const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
+    const size_t PREFETCH_DISTANCE = 1024;
+
+    const unsigned char* ptr = (const unsigned char*)buf;
+    const unsigned char* end = ptr + count;
+    const unsigned char ch = (unsigned char)val;
+
+    // Compare sizes instead of forming 'end - SIMD_LENGTH': for short buffers that pointer is out of bounds, and
+    // for buf == NULL with count == 0 it would wrap around and let the vector loop read from address 0.
+    if (count >= SIMD_LENGTH)
+    {
+        const __m128i target = _mm_set1_epi8((char)ch);
+
+        // Unaligned head. _mm_loadu_si128() is SSE2, whereas _mm_lddqu_si128() would require SSE3.
+        if ((uintptr_t)ptr & SIMD_LENGTH_MASK)
+        {
+            const __m128i chunk = _mm_loadu_si128((const __m128i*)ptr);
+            const unsigned int mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
+            if (mask)
+                return ptr + ImMemchrLowestSetBit(mask);
+            ptr += SIMD_LENGTH - ((uintptr_t)ptr & SIMD_LENGTH_MASK);
+        }
+
+        // Aligned body: one full vector per iteration while at least SIMD_LENGTH bytes remain.
+        for (; (size_t)(end - ptr) >= SIMD_LENGTH; ptr += SIMD_LENGTH)
+        {
+            const __m128i chunk = _mm_load_si128((const __m128i*)ptr);
+            const unsigned int mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
+            if (mask)
+                return ptr + ImMemchrLowestSetBit(mask);
+            if ((size_t)(end - ptr) >= PREFETCH_DISTANCE)
+                _mm_prefetch((const char*)(ptr + PREFETCH_DISTANCE), _MM_HINT_T0);
+        }
+    }
+
+    // Scalar tail, which is also the whole search when the buffer is shorter than one vector.
+    for (; ptr < end; ptr++)
+        if (*ptr == ch)
+            return ptr;
+    return nullptr;
+}
+#else
+const void* ImMemchr(const void* buf, int val, size_t count)
+{
+    return memchr(buf, val, count);
+}
+#endif
+
 // Consider using _stricmp/_strnicmp under Windows or strcasecmp/strncasecmp. We don't actually use either ImStricmp/ImStrnicmp in the codebase any more.
 int ImStricmp(const char* str1, const char* str2)
 {
@@ -1990,7 +2103,7 @@ char* ImStrdupcpy(char* dst, size_t* p_dst_size, const char* src)
 
 const char* ImStrchrRange(const char* str, const char* str_end, char c)
 {
-    const char* p = (const char*)memchr(str, (int)c, str_end - str);
+    const char* p = (const char*)ImMemchr(str, (int)c, str_end - str);
     return p;
 }
 
@@ -2005,7 +2118,7 @@ int ImStrlenW(const ImWchar* str)
 // Find end-of-line. Return pointer will point to either first \n, either str_end.
 const char* ImStreolRange(const char* str, const char* str_end)
 {
-    const char* p = (const char*)memchr(str, '\n', str_end - str);
+    const char* p = (const char*)ImMemchr(str, '\n', str_end - str);
     return p ? p : str_end;
 }
 
@@ -2554,7 +2667,7 @@ int ImTextCountLines(const char* in_text, const char* in_text_end)
     int count = 0;
     while (in_text < in_text_end)
     {
-        const char* line_end = (const char*)memchr(in_text, '\n', in_text_end - in_text);
+        const char* line_end = (const char*)ImMemchr(in_text, '\n', in_text_end - in_text);
         in_text = line_end ? line_end + 1 : in_text_end;
         count++;
     }
@@ -2962,7 +3075,7 @@ void ImGuiTextIndex::append(const char* base, int old_size, int new_size)
     if (EndOffset == 0 || base[EndOffset - 1] == '\n')
         LineOffsets.push_back(EndOffset);
     const char* base_end = base + new_size;
-    for (const char* p = base + old_size; (p = (const char*)memchr(p, '\n', base_end - p)) != 0; )
+    for (const char* p = base + old_size; (p = (const char*)ImMemchr(p, '\n', base_end - p)) != 0; )
         if (++p < base_end) // Don't push a trailing offset on last \n
             LineOffsets.push_back((int)(intptr_t)(p - base));
     EndOffset = ImMax(EndOffset, new_size);
diff --git a/imgui_draw.cpp b/imgui_draw.cpp
index 5bf0f0b1802a..7ce892ce7914 100644
--- a/imgui_draw.cpp
+++ b/imgui_draw.cpp
@@ -4151,7 +4151,7 @@ void ImFont::RenderText(ImDrawList* draw_list, float size, const ImVec2& pos, Im
     if (y + line_height < clip_rect.y)
         while (y + line_height < clip_rect.y && s < text_end)
         {
-            const char* line_end = (const char*)memchr(s, '\n', text_end - s);
+            const char* line_end = (const char*)ImMemchr(s, '\n', text_end - s);
             if (word_wrap_enabled)
             {
                 // FIXME-OPT: This is not optimal as do first do a search for \n before calling CalcWordWrapPositionA().
@@ -4175,7 +4175,7 @@ void ImFont::RenderText(ImDrawList* draw_list, float size, const ImVec2& pos, Im
         float y_end = y;
         while (y_end < clip_rect.w && s_end < text_end)
         {
-            s_end = (const char*)memchr(s_end, '\n', text_end - s_end);
+            s_end = (const char*)ImMemchr(s_end, '\n', text_end - s_end);
             s_end = s_end ? s_end + 1 : text_end;
             y_end += line_height;
         }
diff --git a/imgui_internal.h b/imgui_internal.h
index f2aaeaa83d72..7db6795eea16 100644
--- a/imgui_internal.h
+++ b/imgui_internal.h
@@ -57,20 +57,41 @@ Index of this file:
 #include <math.h>       // sqrtf, fabsf, fmodf, powf, floorf, ceilf, cosf, sinf
 #include <limits.h>     // INT_MIN, INT_MAX
 
 // Enable SSE intrinsics if available
-#if (defined __SSE__ || defined __x86_64__ || defined _M_X64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE)
+// - IMGUI_DISABLE_SIMD turns every intrinsics path off; IMGUI_DISABLE_SSE is kept as its legacy spelling.
+// - MSVC x64 defines neither __SSE__ nor _M_IX86_FP, so 64-bit targets are detected explicitly. Intrinsics headers are only included inside this check so that non-x86 targets keep building.
+#if (defined __SSE__ || defined __x86_64__ || defined _M_X64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE) && !defined(IMGUI_DISABLE_SIMD)
 #define IMGUI_ENABLE_SSE
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
 #include <immintrin.h>
-#if (defined __AVX__ || defined __SSE4_2__)
+#if (defined __AVX__ || defined __SSE4_2__) && !defined(IMGUI_DISABLE_SSE4_2)
 #define IMGUI_ENABLE_SSE4_2
 #include <nmmintrin.h>
 #endif
+#if defined(__AVX__) && !defined(IMGUI_DISABLE_AVX)
+#define IMGUI_ENABLE_AVX
+#endif
+#if defined(__AVX2__) && !defined(IMGUI_DISABLE_AVX2)
+#define IMGUI_ENABLE_AVX2
+#endif
 #endif
 // Emscripten has partial SSE 4.2 support where _mm_crc32_u32 is not available. See https://emscripten.org/docs/porting/simd.html#id11 and #8213
 #if defined(IMGUI_ENABLE_SSE4_2) && !defined(IMGUI_USE_LEGACY_CRC32_ADLER) && !defined(__EMSCRIPTEN__)
 #define IMGUI_ENABLE_SSE4_2_CRC
 #endif
 
+// SIMD ImMemchr() is limited to x86-64, where SSE2 is part of the baseline instruction set.
+// 256-bit integer/byte compares were introduced by AVX2, so AVX1-only builds use the 128-bit path.
+#if (defined __x86_64__ || defined _M_X64) && defined(IMGUI_ENABLE_SSE)
+#if defined(IMGUI_ENABLE_AVX2)
+#define IMGUI_ENABLE_AVX2_IMMEMCHR
+#else
+#define IMGUI_ENABLE_SSE_IMMEMCHR
+#endif
+#endif
+
 // Visual Studio warnings
 #ifdef _MSC_VER
 #pragma warning (push)
@@ -370,7 +391,8 @@ static inline bool      ImIsPowerOfTwo(int v)           { return v != 0 && (v &
 static inline bool      ImIsPowerOfTwo(ImU64 v)         { return v != 0 && (v & (v - 1)) == 0; }
 static inline int       ImUpperPowerOfTwo(int v)        { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; }
 
 // Helpers: String
+IMGUI_API const void*   ImMemchr(const void* buf, int val, size_t count);                   // Find first occurrence of 'val' in buffer given length.
 IMGUI_API int           ImStricmp(const char* str1, const char* str2);                      // Case insensitive compare.
 IMGUI_API int           ImStrnicmp(const char* str1, const char* str2, size_t count);       // Case insensitive compare to a certain count.
 IMGUI_API void          ImStrncpy(char* dst, const char* src, size_t count);                // Copy to a certain count and always zero terminate (strncpy doesn't).
diff --git a/imgui_widgets.cpp b/imgui_widgets.cpp index 2b95c0dd58f3..98cc789072f2 100644 --- a/imgui_widgets.cpp +++ b/imgui_widgets.cpp @@ -209,7 +209,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags) int lines_skipped = 0; while (line < text_end && lines_skipped < lines_skippable) { - const char* line_end = (const char*)memchr(line, '\n', text_end - line); + const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line); if (!line_end) line_end = text_end; if ((flags & ImGuiTextFlags_NoWidthForLargeClippedText) == 0) @@ -230,7 +230,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags) if (IsClippedEx(line_rect, 0)) break; - const char* line_end = (const char*)memchr(line, '\n', text_end - line); + const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line); if (!line_end) line_end = text_end; text_size.x = ImMax(text_size.x, CalcTextSize(line, line_end).x); @@ -245,7 +245,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags) int lines_skipped = 0; while (line < text_end) { - const char* line_end = (const char*)memchr(line, '\n', text_end - line); + const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line); if (!line_end) line_end = text_end; if ((flags & ImGuiTextFlags_NoWidthForLargeClippedText) == 0) @@ -5200,7 +5200,7 @@ bool ImGui::InputTextEx(const char* label, const char* hint, char* buf, int buf_ int line_count = 1; if (is_multiline) { - for (const char* s = text_begin; (s = (const char*)memchr(s, '\n', (size_t)(text_end - s))) != NULL; s++) + for (const char* s = text_begin; (s = (const char*)ImMemchr(s, '\n', (size_t)(text_end - s))) != NULL; s++) { if (cursor_line_no == -1 && s >= cursor_ptr) { cursor_line_no = line_count; } if (selmin_line_no == -1 && s >= selmin_ptr) { selmin_line_no = line_count; } @@ -5278,7 +5278,7 @@ bool ImGui::InputTextEx(const char* label, const char* hint, char* buf, int buf_ break; if 
(rect_pos.y < clip_rect.y) { - p = (const char*)memchr((void*)p, '\n', text_selected_end - p); + p = (const char*)ImMemchr((void*)p, '\n', text_selected_end - p); p = p ? p + 1 : text_selected_end; } else