diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 6ef07513ee4b10..cdd9ca6652dd56 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -3144,6 +3144,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, case NI_IsSupported_True: { assert(sig->numArgs == 0); + impInlineRoot()->m_inlineStrategy->NoteHardwareIntrinsicCheckObserved(); return gtNewIconNode(true); } @@ -3155,6 +3156,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, case NI_IsSupported_Dynamic: { + impInlineRoot()->m_inlineStrategy->NoteHardwareIntrinsicCheckObserved(); break; } @@ -3163,6 +3165,8 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, CORINFO_CLASS_HANDLE typeArgHnd; CorInfoType simdBaseJitType; + impInlineRoot()->m_inlineStrategy->NoteHardwareIntrinsicCheckObserved(); + typeArgHnd = info.compCompHnd->getTypeInstantiationArgument(clsHnd, 0); simdBaseJitType = info.compCompHnd->getTypeForPrimitiveNumericClass(typeArgHnd); diff --git a/src/coreclr/jit/inline.cpp b/src/coreclr/jit/inline.cpp index 3a6cc6b0b4d33a..cf04918ec07ab9 100644 --- a/src/coreclr/jit/inline.cpp +++ b/src/coreclr/jit/inline.cpp @@ -848,6 +848,7 @@ InlineStrategy::InlineStrategy(Compiler* compiler) , m_InitialSizeEstimate(0) , m_CurrentSizeEstimate(0) , m_HasForceViaDiscretionary(false) + , m_HasHardwareIntrinsicCheck(false) #if defined(DEBUG) , m_MethodXmlFilePosition(0) , m_Random(nullptr) @@ -1255,6 +1256,46 @@ bool InlineStrategy::BudgetCheck(unsigned ilSize) return result; } +//------------------------------------------------------------------------ +// NoteHardwareIntrinsicCheckObserved: record that the root method or an +// already-imported inlinee references a HW-intrinsic IsSupported / +// IsHardwareAccelerated capability check, and grow the inline time +// budget on the first such observation per root method. 
+// Notes: +// Methods with SIMD paths typically carry several ISA-specific fallbacks +// (e.g. Vector512/Vector256/Vector128/scalar variants), making them +// IL-heavy. Inlining one such callee can otherwise consume nearly the +// entire inline time budget for the root method, blocking subsequent +// inlines of trivial helpers (Span.Slice, property getters, etc.). +// +// The boost is one-shot per root method and monotonic: it never lowers +// the current budget (preserving any prior growth from force inlines). +// +void InlineStrategy::NoteHardwareIntrinsicCheckObserved() +{ + if (m_HasHardwareIntrinsicCheck) + { + return; + } + + m_HasHardwareIntrinsicCheck = true; + + // Compute the boosted budget in 64-bit to avoid signed overflow when an + // unusually large JitInlineBudget is configured, then clamp to INT_MAX. + const int64_t boosted64 = + static_cast<int64_t>(m_InitialTimeBudget) * static_cast<int64_t>(SIMD_BUDGET_BOOST_MULTIPLIER); + const int boosted = (boosted64 > INT_MAX) ? INT_MAX : static_cast<int>(boosted64); + + if (m_CurrentTimeBudget < boosted) + { + JITDUMP("\nBudget: HW intrinsic IsSupported/IsHardwareAccelerated check observed; " + "boosting inline time budget from %d to %d (initial=%d, multiplier=%d)\n", + m_CurrentTimeBudget, boosted, m_InitialTimeBudget, (int)SIMD_BUDGET_BOOST_MULTIPLIER); + m_CurrentTimeBudget = boosted; + } +} + //------------------------------------------------------------------------ // NewRoot: construct an InlineContext for the root method // diff --git a/src/coreclr/jit/inline.h b/src/coreclr/jit/inline.h index 83d74587789366..4ae4670f800c96 100644 --- a/src/coreclr/jit/inline.h +++ b/src/coreclr/jit/inline.h @@ -989,7 +989,15 @@ class InlineStrategy // Maximum number of over-budget [Intrinsic]-type inlines allowed per root method. 
+ enum { - MAX_OVER_BUDGET_INTRINSIC_INLINES = 50 + MAX_OVER_BUDGET_INTRINSIC_INLINES = 50, + + // When the root method or an already-imported inlinee references a + // Vector*/HW-intrinsic IsSupported / IsHardwareAccelerated property, + // raise the inline time budget to this factor times the initial budget (one-shot, never lowering it). + // Methods with SIMD ISA fallbacks tend to be IL-heavy, and inlining one + // such callee can otherwise consume the budget for trivial helpers + // (e.g., Span.Slice, property getters) that follow. + SIMD_BUDGET_BOOST_MULTIPLIER = 5 }; // Number of over-budget inlines admitted because the callee was on an [Intrinsic] type. @@ -1004,6 +1012,18 @@ class InlineStrategy m_OverBudgetIntrinsicInlineCount++; } + // Note that the root method or an already-imported inlinee uses a HW + // intrinsic IsSupported / IsHardwareAccelerated capability check (e.g., + // Vector128.IsHardwareAccelerated, Vector.IsSupported, Sse41.IsSupported). + // On the first such observation per root method this raises the inline time budget to + // SIMD_BUDGET_BOOST_MULTIPLIER times its initial value, so later small inlinees are not starved. 
+ void NoteHardwareIntrinsicCheckObserved(); + + bool HasObservedHardwareIntrinsicCheck() const + { + return m_HasHardwareIntrinsicCheck; + } + // Number of successful inlines into the root unsigned GetInlineCount() const { @@ -1165,6 +1185,7 @@ class InlineStrategy int m_InitialSizeEstimate; int m_CurrentSizeEstimate; bool m_HasForceViaDiscretionary; + bool m_HasHardwareIntrinsicCheck; #if defined(DEBUG) long m_MethodXmlFilePosition; diff --git a/src/libraries/System.Linq/src/System/Linq/MaxMin.cs b/src/libraries/System.Linq/src/System/Linq/MaxMin.cs index 7adf15a83dcd3e..26e1d6eec512e4 100644 --- a/src/libraries/System.Linq/src/System/Linq/MaxMin.cs +++ b/src/libraries/System.Linq/src/System/Linq/MaxMin.cs @@ -3,9 +3,8 @@ using System.Collections.Generic; using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.CompilerServices; namespace System.Linq { @@ -47,77 +46,68 @@ private static T MinMaxInteger(this IEnumerable source) value = span[i]; } } + return value; } - else if (!Vector256.IsHardwareAccelerated || !Vector256.IsSupported || span.Length < Vector256.Count) - { - ref T current = ref MemoryMarshal.GetReference(span); - ref T lastVectorStart = ref Unsafe.Add(ref current, span.Length - Vector128.Count); - Vector128 best = Vector128.LoadUnsafe(ref current); - current = ref Unsafe.Add(ref current, Vector128.Count); + // All vectorized paths reduce to 128-bit, so we can use that as our accumulator + // regardless of the maximum supported vector size. 
+ Vector128 best128; - while (Unsafe.IsAddressLessThan(ref current, ref lastVectorStart)) - { - best = TMinMax.Compare(best, Vector128.LoadUnsafe(ref current)); - current = ref Unsafe.Add(ref current, Vector128.Count); - } - best = TMinMax.Compare(best, Vector128.LoadUnsafe(ref lastVectorStart)); + if (!Vector256.IsHardwareAccelerated || span.Length < Vector256.Count) + { + ReadOnlySpan data = span; + Vector128 best = Vector128.Create(data); + data = data.Slice(Vector128.Count); - value = best[0]; - for (int i = 1; i < Vector128.Count; i++) + while (data.Length > Vector128.Count) { - if (TMinMax.Compare(best[i], value)) - { - value = best[i]; - } + best = TMinMax.Compare(best, Vector128.Create(data)); + data = data.Slice(Vector128.Count); } + best128 = TMinMax.Compare(best, Vector128.Create(span.Slice(span.Length - Vector128.Count))); } - else if (!Vector512.IsHardwareAccelerated || !Vector512.IsSupported || span.Length < Vector512.Count) + else if (!Vector512.IsHardwareAccelerated || span.Length < Vector512.Count) { - ref T current = ref MemoryMarshal.GetReference(span); - ref T lastVectorStart = ref Unsafe.Add(ref current, span.Length - Vector256.Count); + ReadOnlySpan data = span; + Vector256 best = Vector256.Create(data); + data = data.Slice(Vector256.Count); - Vector256 best = Vector256.LoadUnsafe(ref current); - current = ref Unsafe.Add(ref current, Vector256.Count); - - while (Unsafe.IsAddressLessThan(ref current, ref lastVectorStart)) + while (data.Length > Vector256.Count) { - best = TMinMax.Compare(best, Vector256.LoadUnsafe(ref current)); - current = ref Unsafe.Add(ref current, Vector256.Count); + best = TMinMax.Compare(best, Vector256.Create(data)); + data = data.Slice(Vector256.Count); } - best = TMinMax.Compare(best, Vector256.LoadUnsafe(ref lastVectorStart)); + best = TMinMax.Compare(best, Vector256.Create(span.Slice(span.Length - Vector256.Count))); - value = best[0]; - for (int i = 1; i < Vector256.Count; i++) - { - if (TMinMax.Compare(best[i], 
value)) - { - value = best[i]; - } - } + // Reduce to 128-bit + best128 = TMinMax.Compare(best.GetLower(), best.GetUpper()); } else { - ref T current = ref MemoryMarshal.GetReference(span); - ref T lastVectorStart = ref Unsafe.Add(ref current, span.Length - Vector512.Count); + ReadOnlySpan data = span; + Vector512 best = Vector512.Create(data); + data = data.Slice(Vector512.Count); - Vector512 best = Vector512.LoadUnsafe(ref current); - current = ref Unsafe.Add(ref current, Vector512.Count); - - while (Unsafe.IsAddressLessThan(ref current, ref lastVectorStart)) + while (data.Length > Vector512.Count) { - best = TMinMax.Compare(best, Vector512.LoadUnsafe(ref current)); - current = ref Unsafe.Add(ref current, Vector512.Count); + best = TMinMax.Compare(best, Vector512.Create(data)); + data = data.Slice(Vector512.Count); } - best = TMinMax.Compare(best, Vector512.LoadUnsafe(ref lastVectorStart)); + best = TMinMax.Compare(best, Vector512.Create(span.Slice(span.Length - Vector512.Count))); - value = best[0]; - for (int i = 1; i < Vector512.Count; i++) + // Reduce to 128-bit + Vector256 best256 = TMinMax.Compare(best.GetLower(), best.GetUpper()); + best128 = TMinMax.Compare(best256.GetLower(), best256.GetUpper()); + } + + // Reduce to single value + // NOTE: this can be optimized further with shuffles. + value = best128[0]; + for (int i = 1; i < Vector128.Count; i++) + { + if (TMinMax.Compare(best128[i], value)) { - if (TMinMax.Compare(best[i], value)) - { - value = best[i]; - } + value = best128[i]; } } }