Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

55% memchr optimization with SIMD on x86-64 | Macros config SIMD #8421

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 105 additions & 4 deletions imgui.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1946,6 +1946,107 @@ ImVec2 ImTriangleClosestPoint(const ImVec2& a, const ImVec2& b, const ImVec2& c,
// [SECTION] MISC HELPERS/UTILITIES (String, Format, Hash functions)
//-----------------------------------------------------------------------------

#if defined IMGUI_ENABLE_AVX2_IMMEMCHR
// Index of the lowest set bit of a non-zero 32-bit mask.
// Uses compiler built-ins instead of _tzcnt_u32() so that a build with AVX2 enabled does not
// additionally require BMI1 to be enabled (e.g. -mbmi on GCC).
static inline unsigned int ImMemchrLowestSetBit(unsigned int mask)
{
#if defined(_MSC_VER) && !defined(__clang__)
    unsigned long index = 0;
    _BitScanForward(&index, mask);
    return (unsigned int)index;
#else
    return (unsigned int)__builtin_ctz(mask);
#endif
}

// AVX2 implementation of memchr(): find the first byte equal to (unsigned char)val in [buf, buf + count).
// - buf:   start of the buffer to scan (no alignment requirement).
// - val:   value to look for, converted to unsigned char like memchr() does.
// - count: number of bytes to scan.
// Returns a pointer to the first matching byte, or nullptr when there is none.
// Never reads outside [buf, buf + count): vector loads are only issued while a full 32-byte chunk remains.
const void* ImMemchr(const void* buf, int val, size_t count)
{
    const size_t SIMD_LENGTH = 32;
    const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
    const size_t PREFETCH_DISTANCE = 1024;

    const unsigned char* ptr = (const unsigned char*)buf;
    const unsigned char* end = ptr + count;
    const unsigned char ch = (unsigned char)val;

    // Compare sizes rather than pointers: forming 'end - SIMD_LENGTH' for a short buffer would
    // create a pointer before the start of the buffer, which is undefined behavior.
    if (count >= SIMD_LENGTH)
    {
        const unsigned char* align_end = end - SIMD_LENGTH; // Last position where a full chunk still fits
        const __m256i target = _mm256_set1_epi8((char)ch);

        // Unaligned head: test one unaligned chunk, then step forward to the next 32-byte boundary.
        // The bytes between that boundary and the end of this chunk get tested twice, which is harmless.
        const size_t misalignment = (size_t)((uintptr_t)ptr & SIMD_LENGTH_MASK);
        if (misalignment != 0)
        {
            const __m256i chunk = _mm256_lddqu_si256((const __m256i*)ptr);
            const unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
            if (mask)
                return (const void*)(ptr + ImMemchrLowestSetBit(mask));

            ptr += SIMD_LENGTH - misalignment;
        }

        // Aligned body: one aligned 32-byte load per iteration.
        for (; ptr <= align_end; ptr += SIMD_LENGTH)
        {
            const __m256i chunk = _mm256_load_si256((const __m256i*)ptr);
            const unsigned int mask = (unsigned int)_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, target));
            if (mask)
                return (const void*)(ptr + ImMemchrLowestSetBit(mask));

            // Only hint addresses that are still inside the buffer.
            if ((size_t)(end - ptr) >= PREFETCH_DISTANCE)
                _mm_prefetch((const char*)(ptr + PREFETCH_DISTANCE), _MM_HINT_T0);
        }
    }

    // Scalar tail: fewer than SIMD_LENGTH bytes remain (or the buffer was too short for any vector load).
    for (; ptr < end; ++ptr)
    {
        if (*ptr == ch)
            return (const void*)ptr;
    }

    return nullptr;
}
#elif defined IMGUI_ENABLE_SSE_IMMEMCHR
// Index of the lowest set bit of a non-zero 32-bit mask.
// Uses compiler built-ins instead of _tzcnt_u32(), which is a BMI1 intrinsic and is not
// available when only baseline SSE/SSE2 code generation is enabled.
static inline unsigned int ImMemchrLowestSetBit(unsigned int mask)
{
#if defined(_MSC_VER) && !defined(__clang__)
    unsigned long index = 0;
    _BitScanForward(&index, mask);
    return (unsigned int)index;
#else
    return (unsigned int)__builtin_ctz(mask);
#endif
}

// SSE2 implementation of memchr(): find the first byte equal to (unsigned char)val in [buf, buf + count).
// - buf:   start of the buffer to scan (no alignment requirement).
// - val:   value to look for, converted to unsigned char like memchr() does.
// - count: number of bytes to scan.
// Returns a pointer to the first matching byte, or nullptr when there is none.
// Only SSE2 intrinsics are used (no SSE3 _mm_lddqu_si128), matching the x86-64 baseline this path is enabled for.
// Never reads outside [buf, buf + count): vector loads are only issued while a full 16-byte chunk remains.
const void* ImMemchr(const void* buf, int val, size_t count)
{
    const size_t SIMD_LENGTH = 16;
    const size_t SIMD_LENGTH_MASK = SIMD_LENGTH - 1;
    const size_t PREFETCH_DISTANCE = 1024;

    const unsigned char* ptr = (const unsigned char*)buf;
    const unsigned char* end = ptr + count;
    const unsigned char ch = (unsigned char)val;

    // Compare sizes rather than pointers: forming 'end - SIMD_LENGTH' for a short buffer would
    // create a pointer before the start of the buffer, which is undefined behavior.
    if (count >= SIMD_LENGTH)
    {
        const unsigned char* align_end = end - SIMD_LENGTH; // Last position where a full chunk still fits
        const __m128i target = _mm_set1_epi8((char)ch);

        // Unaligned head: test one unaligned chunk, then step forward to the next 16-byte boundary.
        // The bytes between that boundary and the end of this chunk get tested twice, which is harmless.
        const size_t misalignment = (size_t)((uintptr_t)ptr & SIMD_LENGTH_MASK);
        if (misalignment != 0)
        {
            const __m128i chunk = _mm_loadu_si128((const __m128i*)ptr);
            const unsigned int mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
            if (mask)
                return (const void*)(ptr + ImMemchrLowestSetBit(mask));

            ptr += SIMD_LENGTH - misalignment;
        }

        // Aligned body: one aligned 16-byte load per iteration.
        for (; ptr <= align_end; ptr += SIMD_LENGTH)
        {
            const __m128i chunk = _mm_load_si128((const __m128i*)ptr);
            const unsigned int mask = (unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(chunk, target));
            if (mask)
                return (const void*)(ptr + ImMemchrLowestSetBit(mask));

            // Only hint addresses that are still inside the buffer.
            if ((size_t)(end - ptr) >= PREFETCH_DISTANCE)
                _mm_prefetch((const char*)(ptr + PREFETCH_DISTANCE), _MM_HINT_T0);
        }
    }

    // Scalar tail: fewer than SIMD_LENGTH bytes remain (or the buffer was too short for any vector load).
    for (; ptr < end; ++ptr)
    {
        if (*ptr == ch)
            return (const void*)ptr;
    }

    return nullptr;
}
#else
// Portable fallback used when no SIMD variant is enabled: defer to the C runtime's memchr().
// Returns a pointer to the first byte equal to (unsigned char)val in [buf, buf + count), or a null pointer if absent.
const void* ImMemchr(const void* buf, int val, size_t count)
{
    const void* const match = memchr(buf, val, count);
    return match;
}
#endif

// Consider using _stricmp/_strnicmp under Windows or strcasecmp/strncasecmp. We don't actually use either ImStricmp/ImStrnicmp in the codebase any more.
int ImStricmp(const char* str1, const char* str2)
{
Expand Down Expand Up @@ -1993,7 +2094,7 @@ char* ImStrdupcpy(char* dst, size_t* p_dst_size, const char* src)

// Find the first occurrence of character 'c' in the range [str, str_end) using a byte search.
// Returns a pointer to the matching character, or NULL when 'c' does not occur in the range.
const char* ImStrchrRange(const char* str, const char* str_end, char c)
{
const char* p = (const char*)memchr(str, (int)c, str_end - str);
const char* p = (const char*)ImMemchr(str, (int)c, str_end - str);
return p;
}

Expand All @@ -2008,7 +2109,7 @@ int ImStrlenW(const ImWchar* str)
// Find end-of-line. The returned pointer is either the first '\n' in [str, str_end), or str_end when there is none.
const char* ImStreolRange(const char* str, const char* str_end)
{
const char* p = (const char*)memchr(str, '\n', str_end - str);
const char* p = (const char*)ImMemchr(str, '\n', str_end - str);
return p ? p : str_end; // No newline found: report the end of the range instead of NULL
}

Expand Down Expand Up @@ -2557,7 +2658,7 @@ int ImTextCountLines(const char* in_text, const char* in_text_end)
int count = 0;
while (in_text < in_text_end)
{
const char* line_end = (const char*)memchr(in_text, '\n', in_text_end - in_text);
const char* line_end = (const char*)ImMemchr(in_text, '\n', in_text_end - in_text);
in_text = line_end ? line_end + 1 : in_text_end;
count++;
}
Expand Down Expand Up @@ -2965,7 +3066,7 @@ void ImGuiTextIndex::append(const char* base, int old_size, int new_size)
if (EndOffset == 0 || base[EndOffset - 1] == '\n')
LineOffsets.push_back(EndOffset);
const char* base_end = base + new_size;
for (const char* p = base + old_size; (p = (const char*)memchr(p, '\n', base_end - p)) != 0; )
for (const char* p = base + old_size; (p = (const char*)ImMemchr(p, '\n', base_end - p)) != 0; )
if (++p < base_end) // Don't push a trailing offset on last \n
LineOffsets.push_back((int)(intptr_t)(p - base));
EndOffset = ImMax(EndOffset, new_size);
Expand Down
4 changes: 2 additions & 2 deletions imgui_draw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4141,7 +4141,7 @@ void ImFont::RenderText(ImDrawList* draw_list, float size, const ImVec2& pos, Im
if (y + line_height < clip_rect.y)
while (y + line_height < clip_rect.y && s < text_end)
{
const char* line_end = (const char*)memchr(s, '\n', text_end - s);
const char* line_end = (const char*)ImMemchr(s, '\n', text_end - s);
if (word_wrap_enabled)
{
// FIXME-OPT: This is not optimal as do first do a search for \n before calling CalcWordWrapPositionA().
Expand All @@ -4165,7 +4165,7 @@ void ImFont::RenderText(ImDrawList* draw_list, float size, const ImVec2& pos, Im
float y_end = y;
while (y_end < clip_rect.w && s_end < text_end)
{
s_end = (const char*)memchr(s_end, '\n', text_end - s_end);
s_end = (const char*)ImMemchr(s_end, '\n', text_end - s_end);
s_end = s_end ? s_end + 1 : text_end;
y_end += line_height;
}
Expand Down
35 changes: 29 additions & 6 deletions imgui_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,43 @@ Index of this file:
#include <math.h> // sqrtf, fabsf, fmodf, powf, floorf, ceilf, cosf, sinf
#include <limits.h> // INT_MIN, INT_MAX

// Enable SSE intrinsics if available
#if (defined __SSE__ || defined __x86_64__ || defined _M_X64 || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE)
// Include compiler-specific intrinsics header
#if !defined(IMGUI_DISABLE_SIMD)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) || defined(__clang__)
#include <x86intrin.h>
#endif
#endif

// Enable SIMD x86-64 intrinsics if available
#if (defined __x86_64__ || defined _M_X64) && !defined(IMGUI_DISABLE_SIMD)
#if (defined __SSE__ || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))) && !defined(IMGUI_DISABLE_SSE)
#define IMGUI_ENABLE_SSE
#include <immintrin.h>
#if (defined __AVX__ || defined __SSE4_2__)
#endif
#if defined (__SSE4_2__) && !defined(IMGUI_DISABLE_SSE4_2)
#define IMGUI_ENABLE_SSE4_2
#include <nmmintrin.h>
#endif
#if (defined __AVX__) && !defined(IMGUI_DISABLE_AVX)
#define IMGUI_ENABLE_AVX
#endif
#if (defined __AVX2__) && !defined(IMGUI_DISABLE_AVX2)
#define IMGUI_ENABLE_AVX2
#endif
#endif

// Emscripten has partial SSE 4.2 support where _mm_crc32_u32 is not available. See https://emscripten.org/docs/porting/simd.html#id11 and #8213
#if defined(IMGUI_ENABLE_SSE4_2) && !defined(IMGUI_USE_LEGACY_CRC32_ADLER) && !defined(__EMSCRIPTEN__)
#define IMGUI_ENABLE_SSE4_2_CRC
#endif

// Only AVX2 provides integer/byte instructions on 256-bit registers, so the 256-bit path cannot be implemented with AVX1 alone.
#if defined(IMGUI_ENABLE_AVX2)
#define IMGUI_ENABLE_AVX2_IMMEMCHR
#elif defined(IMGUI_ENABLE_AVX) || defined(IMGUI_ENABLE_SSE)
#define IMGUI_ENABLE_SSE_IMMEMCHR
#endif

// Visual Studio warnings
#ifdef _MSC_VER
#pragma warning (push)
Expand Down Expand Up @@ -370,7 +393,7 @@ static inline bool ImIsPowerOfTwo(int v) { return v != 0 && (v &
static inline bool ImIsPowerOfTwo(ImU64 v) { return v != 0 && (v & (v - 1)) == 0; }
static inline int ImUpperPowerOfTwo(int v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; }

// Helpers: String
IMGUI_API const void* ImMemchr(const void* buf, int val, size_t count); // Find first occurrence of 'val' in buffer given length.
IMGUI_API int ImStricmp(const char* str1, const char* str2); // Case insensitive compare.
IMGUI_API int ImStrnicmp(const char* str1, const char* str2, size_t count); // Case insensitive compare to a certain count.
IMGUI_API void ImStrncpy(char* dst, const char* src, size_t count); // Copy to a certain count and always zero terminate (strncpy doesn't).
Expand Down
10 changes: 5 additions & 5 deletions imgui_widgets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags)
int lines_skipped = 0;
while (line < text_end && lines_skipped < lines_skippable)
{
const char* line_end = (const char*)memchr(line, '\n', text_end - line);
const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line);
if (!line_end)
line_end = text_end;
if ((flags & ImGuiTextFlags_NoWidthForLargeClippedText) == 0)
Expand All @@ -230,7 +230,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags)
if (IsClippedEx(line_rect, 0))
break;

const char* line_end = (const char*)memchr(line, '\n', text_end - line);
const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line);
if (!line_end)
line_end = text_end;
text_size.x = ImMax(text_size.x, CalcTextSize(line, line_end).x);
Expand All @@ -245,7 +245,7 @@ void ImGui::TextEx(const char* text, const char* text_end, ImGuiTextFlags flags)
int lines_skipped = 0;
while (line < text_end)
{
const char* line_end = (const char*)memchr(line, '\n', text_end - line);
const char* line_end = (const char*)ImMemchr(line, '\n', text_end - line);
if (!line_end)
line_end = text_end;
if ((flags & ImGuiTextFlags_NoWidthForLargeClippedText) == 0)
Expand Down Expand Up @@ -5200,7 +5200,7 @@ bool ImGui::InputTextEx(const char* label, const char* hint, char* buf, int buf_
int line_count = 1;
if (is_multiline)
{
for (const char* s = text_begin; (s = (const char*)memchr(s, '\n', (size_t)(text_end - s))) != NULL; s++)
for (const char* s = text_begin; (s = (const char*)ImMemchr(s, '\n', (size_t)(text_end - s))) != NULL; s++)
{
if (cursor_line_no == -1 && s >= cursor_ptr) { cursor_line_no = line_count; }
if (selmin_line_no == -1 && s >= selmin_ptr) { selmin_line_no = line_count; }
Expand Down Expand Up @@ -5278,7 +5278,7 @@ bool ImGui::InputTextEx(const char* label, const char* hint, char* buf, int buf_
break;
if (rect_pos.y < clip_rect.y)
{
p = (const char*)memchr((void*)p, '\n', text_selected_end - p);
p = (const char*)ImMemchr((void*)p, '\n', text_selected_end - p);
p = p ? p + 1 : text_selected_end;
}
else
Expand Down