diff --git a/src/coreclr/debug/daccess/daccess.cpp b/src/coreclr/debug/daccess/daccess.cpp index 2ff45fd348e30a..047f0f84dc0f5d 100644 --- a/src/coreclr/debug/daccess/daccess.cpp +++ b/src/coreclr/debug/daccess/daccess.cpp @@ -1544,24 +1544,24 @@ DAC_INSTANCE* DacInstanceManager::Add(DAC_INSTANCE* inst) { _ASSERTE(inst != NULL); -#ifdef _DEBUG - bool isInserted = (m_hash.find(inst->addr) == m_hash.end()); -#endif //_DEBUG - DAC_INSTANCE *(&target) = m_hash[inst->addr]; - _ASSERTE(!isInserted || target == NULL); - if( target != NULL ) + const KeyValuePair* pEntry = m_hash.LookupPtr(inst->addr); + if (pEntry != NULL) { + DAC_INSTANCE* existing = pEntry->Value(); + //This is necessary to preserve the semantics of Supersede, however, it //is more or less dead code. - inst->next = target; - target = inst; + inst->next = existing; //verify descending order - _ASSERTE(inst->size >= target->size); + _ASSERTE(inst->size >= existing->size); + + m_hash.ReplacePtr(pEntry, KeyValuePair(inst->addr, inst)); } else { - target = inst; + inst->next = NULL; + m_hash.Add(KeyValuePair(inst->addr, inst)); } return inst; @@ -1803,15 +1803,13 @@ DacInstanceManager::Find(TADDR addr) DAC_INSTANCE* DacInstanceManager::Find(TADDR addr) { - DacInstanceHashIterator iter = m_hash.find(addr); - if( iter == m_hash.end() ) - { - return NULL; - } - else + const KeyValuePair* pEntry = m_hash.LookupPtr(addr); + if (pEntry != NULL) { - return iter->second; + return pEntry->Value(); } + + return NULL; } #endif // if defined(DAC_HASHTABLE) @@ -1887,12 +1885,11 @@ DacInstanceManager::Supersede(DAC_INSTANCE* inst) // later cleanup. // - DacInstanceHashIterator iter = m_hash.find(inst->addr); - if( iter == m_hash.end() ) + const KeyValuePair* pEntry = m_hash.LookupPtr(inst->addr); + if (pEntry == NULL) return; - DAC_INSTANCE** bucket = &(iter->second); - DAC_INSTANCE* cur = *bucket; + DAC_INSTANCE* cur = pEntry->Value(); DAC_INSTANCE* prev = NULL; //walk through the chain looking for this particular instance while (cur) @@ -1901,7 +1898,15 @@ DacInstanceManager::Supersede(DAC_INSTANCE* inst) { if (!prev) { - *bucket = inst->next; + // Removing the head of the chain. + if (inst->next != NULL) + { + m_hash.ReplacePtr(pEntry, KeyValuePair(inst->addr, inst->next)); + } + else + { + m_hash.Remove(inst->addr); + } } else { @@ -1982,7 +1987,7 @@ void DacInstanceManager::Flush(bool fSaveBlock) } } #else //DAC_HASHTABLE - m_hash.clear(); + m_hash.RemoveAll(); #endif //DAC_HASHTABLE InitEmpty(); @@ -2021,18 +2026,17 @@ DacInstanceManager::ClearEnumMemMarker(void) void DacInstanceManager::ClearEnumMemMarker(void) { - ULONG i; DAC_INSTANCE* inst; - DacInstanceHashIterator end = m_hash.end(); + DacInstanceHashIterator end = m_hash.End(); /* REVISIT_TODO Fri 10/20/2006 * This might have an issue, since it might miss chained entries off of * ->next. However, ->next is going away, and for all intents and * purposes, this never happens. */ - for( DacInstanceHashIterator cur = m_hash.begin(); cur != end; ++cur ) + for (DacInstanceHashIterator cur = m_hash.Begin(); cur != end; ++cur) { - cur->second->enumMem = 0; + cur->Value()->enumMem = 0; } for (inst = m_superseded; inst; inst = inst->next) @@ -2144,10 +2148,10 @@ DacInstanceManager::DumpAllInstances( int numInBucket = 0; #endif // #if defined(DAC_MEASURE_PERF) - DacInstanceHashIterator end = m_hash.end(); - for (DacInstanceHashIterator cur = m_hash.begin(); end != cur; ++cur) + DacInstanceHashIterator end = m_hash.End(); + for (DacInstanceHashIterator cur = m_hash.Begin(); end != cur; ++cur) { - inst = cur->second; + inst = cur->Value(); // Only report those we intended to. // So far, only metadata is excluded! @@ -3270,6 +3274,9 @@ ClrDataAccess::Flush(void) // m_mdImports.Flush(); + // Free cached patch entries for thread unwinding + m_patchCache.Flush(); + // Free instance memory. m_instances.Flush(); diff --git a/src/coreclr/debug/daccess/dacfn.cpp b/src/coreclr/debug/daccess/dacfn.cpp index c3131e2d571efe..e2968139fa6afa 100644 --- a/src/coreclr/debug/daccess/dacfn.cpp +++ b/src/coreclr/debug/daccess/dacfn.cpp @@ -1505,43 +1505,68 @@ HRESULT DacReplacePatchesInHostMemory(MemoryRange range, PVOID pBuffer) return S_OK; } + if (g_dacImpl == nullptr) + { + return E_UNEXPECTED; + } + + // Cache the patches as the target is not running during DAC operations, and hash + // table iteration is pretty slow. + const SArray& entries = g_dacImpl->m_patchCache.GetEntries(); + + CORDB_ADDRESS address = (CORDB_ADDRESS)(dac_cast(range.StartAddress())); + SIZE_T cbSize = range.Size(); + + for (COUNT_T i = 0; i < entries.GetCount(); i++) + { + const DacPatchCacheEntry& entry = entries[i]; + + if (IsPatchInRequestedRange(address, cbSize, entry.address)) + { + CORDbgSetInstructionEx(reinterpret_cast(pBuffer), address, entry.address, entry.opcode, cbSize); + } + } + + return S_OK; +} + +const SArray& DacPatchCache::GetEntries() +{ + Populate(); + return m_entries; +} + +void DacPatchCache::Populate() +{ + SUPPORTS_DAC; + + if (m_isPopulated) + { + return; + } + + // Clear any stale entries from previous failed population attempts + m_entries.Clear(); + HASHFIND info; DebuggerPatchTable * pTable = DebuggerController::GetPatchTable(); DebuggerControllerPatch * pPatch = pTable->GetFirstPatch(&info); - // - // The unwinder needs to read the stack very often to restore pushed registers, retrieve the - // return addres, etc. However, stack addresses should never be patched. - // One way to optimize this code is to pass the stack base and the stack limit of the thread to this - // function and use those two values to filter out stack addresses. - // - // Another thing we can do is instead of enumerating the patches, we could enumerate the address. - // This is more efficient when we have a large number of patches and a small memory range. Perhaps - // we could do a hybrid approach, i.e. use the size of the range and the number of patches to dynamically - // determine which enumeration is more efficient. - // while (pPatch != NULL) { CORDB_ADDRESS patchAddress = (CORDB_ADDRESS)dac_cast(pPatch->address); if (patchAddress != NULL) { - PRD_TYPE opcode = pPatch->opcode; - - CORDB_ADDRESS address = (CORDB_ADDRESS)(dac_cast(range.StartAddress())); - SIZE_T cbSize = range.Size(); - - // Check if the address of the patch is in the specified memory range. - if (IsPatchInRequestedRange(address, cbSize, patchAddress)) - { - // Replace the patch in the buffer with the original opcode. - CORDbgSetInstructionEx(reinterpret_cast(pBuffer), address, patchAddress, opcode, cbSize); - } + DacPatchCacheEntry entry; + entry.address = patchAddress; + entry.opcode = pPatch->opcode; + m_entries.Append(entry); } pPatch = pTable->GetNextPatch(&info); } - return S_OK; + m_isPopulated = true; } diff --git a/src/coreclr/debug/daccess/dacimpl.h b/src/coreclr/debug/daccess/dacimpl.h index 8de684a5dae9a2..88d729f4d46e49 100644 --- a/src/coreclr/debug/daccess/dacimpl.h +++ b/src/coreclr/debug/daccess/dacimpl.h @@ -16,16 +16,8 @@ #include "gcinterface.dac.h" //--------------------------------------------------------------------------------------- // Setting DAC_HASHTABLE tells the DAC to use the hand rolled hashtable for -// storing code:DAC_INSTANCE . Otherwise, the DAC uses the STL unordered_map to. +// storing code:DAC_INSTANCE . Otherwise, the DAC uses SHash. -#define DAC_HASHTABLE - -#ifndef DAC_HASHTABLE -#pragma push_macro("return") -#undef return -#include -#pragma pop_macro("return") -#endif //DAC_HASHTABLE extern CRITICAL_SECTION g_dacCritSec; // Convert between CLRDATA_ADDRESS and TADDR. @@ -732,54 +724,28 @@ class DacInstanceManager HashInstanceKeyBlock* m_hash[DAC_INSTANCE_HASH_SIZE]; #else //DAC_HASHTABLE - // We're going to use the STL unordered_map for our instance hash. - // This has the benefit of scaling to different workloads appropriately (as opposed to having a - // fixed number of buckets). - - class DacHashCompare : public std::hash_compare + // SHash-based hash table for DAC instances, keyed by target address. + // The custom hash function avoids pseudo-randomizing in favor of a simple + // shift that clusters nearby addresses together. This improves access + // locality and has a significant positive impact on minidump library + // performance when enumerating entries during dump generation (as dbghelp + // has to coalesce adjacent memory regions). + class DacInstanceSHashTraits : public NonDacAwareSHashTraits> { public: - // Custom hash function - // The default hash function uses a pseudo-randomizing function to get a random - // distribution. In our case, we'd actually like a more even distribution to get - // better access locality (see comments for DAC_INSTANCE_HASH_BITS). - // - // Also, when enumerating the hash during dump generation, clustering nearby addresses - // together can have a significant positive impact on the performance of the minidump - // library (at least the un-optimized version 6.5 linked into our dw20.exe - on Vista+ - // we use the OS's version 6.7+ with radically improved perf characteristics). Having - // a random distribution is actually the worst-case because it means most blocks won't - // be merged until near the end, and a large number of intermediate blocks will have to - // be searched with each call. - // - // The default pseudo-randomizing function also requires a call to ldiv which shows up as - // a 3%-5% perf hit in most perf-sensitive scenarios, so this should also always be - // faster. - inline size_t operator()(const TADDR& keyval) const + typedef MapSHashTraits PARENT; + typedef PARENT::element_t element_t; + typedef PARENT::count_t count_t; + typedef PARENT::key_t key_t; + + static count_t Hash(key_t k) { - return (unsigned)(keyval >>DAC_INSTANCE_HASH_SHIFT); + return (count_t)(k >> DAC_INSTANCE_HASH_SHIFT); } - - // Explicitly bring in the two-argument comparison function from the base class (just less-than) - // This is necessary because once we override one form of operator() above, we don't automatically - // get the others by C++ inheritance rules. - using std::hash_compare::operator(); - -#ifdef NIDUMP_CUSTOMIZED_DAC_HASH // not set - //this particular number is supposed to be roughly the same amount of - //memory as the old code (buckets * number of entries in the old - //blocks.) - //disabled for now. May tweak implementation later. It turns out that - //having a large number of initial buckets is excellent for nidump, but it - // is terrible for most other scenarios due to the cost of clearing them at - // every Flush. Once there is a better perf suite, we can tweak these values more. - static const size_t min_buckets = DAC_INSTANCE_HASH_SIZE * 256; -#endif - }; - typedef std::unordered_map DacInstanceHash; - typedef DacInstanceHash::value_type DacInstanceHashValue; - typedef DacInstanceHash::iterator DacInstanceHashIterator; + + typedef SHash DacInstanceHash; + typedef DacInstanceHash::Iterator DacInstanceHashIterator; DacInstanceHash m_hash; #endif //DAC_HASHTABLE @@ -795,6 +761,48 @@ class DacStreamManager; #endif // FEATURE_MINIMETADATA_IN_TRIAGEDUMPS +//---------------------------------------------------------------------------- +// +// DacPatchCache - Caches debugger breakpoint patches from the target process. +// +// DacReplacePatchesInHostMemory needs to iterate the target's patch hash table +// to replace breakpoint instructions with original opcodes. This iteration is +// expensive because it walks all hash buckets and entries and it will hit either a +// hash lookup in the instance cache or a cache miss + remote read. Since the target +// is not running during DAC operations, the patches should not change while we are performing +// memory enumeration, but if they do we'll miss them until the next time Flush() gets called. +// +//---------------------------------------------------------------------------- + +struct DacPatchCacheEntry +{ + CORDB_ADDRESS address; + PRD_TYPE opcode; +}; + +class DacPatchCache +{ +public: + DacPatchCache() + : m_isPopulated(false) + { + } + + const SArray& GetEntries(); + + void Flush() + { + m_entries.Clear(); + m_isPopulated = false; + } + +private: + void Populate(); + + SArray m_entries; + bool m_isPopulated; +}; + //---------------------------------------------------------------------------- // // ClrDataAccess. @@ -1408,6 +1416,8 @@ class ClrDataAccess ULONG32 m_instanceAge; bool m_debugMode; + DacPatchCache m_patchCache; + #ifdef FEATURE_MINIMETADATA_IN_TRIAGEDUMPS protected: diff --git a/src/coreclr/debug/daccess/enummem.cpp b/src/coreclr/debug/daccess/enummem.cpp index 7856a1ff0c096e..f7cc76c63966d1 100644 --- a/src/coreclr/debug/daccess/enummem.cpp +++ b/src/coreclr/debug/daccess/enummem.cpp @@ -2000,6 +2000,15 @@ ClrDataAccess::EnumMemoryRegions(IN ICLRDataEnumMemoryRegionsCallback* callback, { DacLogMessage("EnumMemoryRegions(CLRDATA_ENUM_MEM_HEAP2)\n"); } + else if (g_EnableFastHeapDumps != 0) + { + // When the target process had DOTNET_EnableFastHeapDumps set, + // use HEAP2 which dumps loader heap pages in bulk instead of + // relying only on per-structure heap enumeration. This is + // significantly faster for large heap dump scenarios. + flags = CLRDATA_ENUM_MEM_HEAP2; + DacLogMessage("EnumMemoryRegions(CLRDATA_ENUM_MEM_HEAP promoted to HEAP2 via DOTNET_EnableFastHeapDumps)\n"); + } else { flags = CLRDATA_ENUM_MEM_HEAP; diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 13962a20609056..8f981278dcdc83 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -232,6 +232,7 @@ CONFIG_DWORD_INFO(UNSUPPORTED_Debugging_RequiredVersion, W("UNSUPPORTED_Debuggin #ifdef FEATURE_MINIMETADATA_IN_TRIAGEDUMPS RETAIL_CONFIG_DWORD_INFO(INTERNAL_MiniMdBufferCapacity, W("MiniMdBufferCapacity"), 64 * 1024, "The max size of the buffer to store mini metadata information for triage- and mini-dumps.") #endif // FEATURE_MINIMETADATA_IN_TRIAGEDUMPS +RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableFastHeapDumps, W("EnableFastHeapDumps"), 0, "If non-zero, the DAC will use the new mode of collecting heap dumps through loader heap enumeration.") CONFIG_DWORD_INFO(INTERNAL_DbgNativeCodeBpBindsAcrossVersions, W("DbgNativeCodeBpBindsAcrossVersions"), 0, "If non-zero causes native breakpoints at offset 0 to bind in all tiered compilation versions of the given method") RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_RichDebugInfo, W("RichDebugInfo"), 0, "If non-zero store some additional debug information for each jitted method") diff --git a/src/coreclr/inc/dacvars.h b/src/coreclr/inc/dacvars.h index 030c18f0b9db20..6b479d7e13c10b 100644 --- a/src/coreclr/inc/dacvars.h +++ b/src/coreclr/inc/dacvars.h @@ -180,6 +180,7 @@ DEFINE_DACVAR(bool, dac__g_fProcessDetach, ::g_fProcessDetach) DEFINE_DACVAR(DWORD, dac__g_fEEShutDown, ::g_fEEShutDown) DEFINE_DACVAR(ULONG, dac__g_CORDebuggerControlFlags, ::g_CORDebuggerControlFlags) +DEFINE_DACVAR(DWORD, dac__g_EnableFastHeapDumps, ::g_EnableFastHeapDumps) DEFINE_DACVAR(UNKNOWN_POINTER_TYPE, dac__g_pDebugger, ::g_pDebugger) DEFINE_DACVAR(UNKNOWN_POINTER_TYPE, dac__g_pDebugInterface, ::g_pDebugInterface) DEFINE_DACVAR(UNKNOWN_POINTER_TYPE, dac__g_pEEDbgInterfaceImpl, ::g_pEEDbgInterfaceImpl) diff --git a/src/coreclr/vm/ceemain.cpp b/src/coreclr/vm/ceemain.cpp index b23d544ec73760..dc50b4e6c25283 100644 --- a/src/coreclr/vm/ceemain.cpp +++ b/src/coreclr/vm/ceemain.cpp @@ -625,7 +625,7 @@ void EEStartupHelper() IfFailGo(EEConfig::Setup()); - + g_EnableFastHeapDumps = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableFastHeapDumps); #ifdef HOST_WINDOWS InitializeCrashDump(); @@ -2235,4 +2235,3 @@ void ContractRegressionCheck() } #endif // ENABLE_CONTRACTS_IMPL - diff --git a/src/coreclr/vm/vars.cpp b/src/coreclr/vm/vars.cpp index 6baded4ca7ddfc..4973b7ac1016e5 100644 --- a/src/coreclr/vm/vars.cpp +++ b/src/coreclr/vm/vars.cpp @@ -144,6 +144,7 @@ ETW::CEtwTracer * g_pEtwTracer = NULL; GPTR_IMPL(DebugInterface, g_pDebugInterface); // A managed debugger may set this flag to high from out of process. GVAL_IMPL_INIT(DWORD, g_CORDebuggerControlFlags, DBCF_NORMAL_OPERATION); +GVAL_IMPL_INIT(DWORD, g_EnableFastHeapDumps, 0); #ifdef DEBUGGING_SUPPORTED GPTR_IMPL(EEDbgInterfaceImpl, g_pEEDbgInterfaceImpl); diff --git a/src/coreclr/vm/vars.hpp b/src/coreclr/vm/vars.hpp index b34ec9426621da..fa8413eb04c2d1 100644 --- a/src/coreclr/vm/vars.hpp +++ b/src/coreclr/vm/vars.hpp @@ -432,6 +432,7 @@ GPTR_DECL(StressLog, g_pStressLog); // GPTR_DECL(DebugInterface, g_pDebugInterface); GVAL_DECL(DWORD, g_CORDebuggerControlFlags); +GVAL_DECL(DWORD, g_EnableFastHeapDumps); #ifdef DEBUGGING_SUPPORTED GPTR_DECL(EEDbgInterfaceImpl, g_pEEDbgInterfaceImpl); #endif // DEBUGGING_SUPPORTED