Skip to content

Commit adf5919

Browse files
ezyang authored and pytorchmergebot committed
Add option to record C++ backtraces in _record_memory_history (pytorch#86145)
I used this to debug pytorch#86136 so it is useful. The implementation is not so fast so it is not enabled by default. Signed-off-by: Edward Z. Yang <[email protected]> Pull Request resolved: pytorch#86145 Approved by: https://github.com/albanD, https://github.com/zdevito
1 parent 97d6b5b commit adf5919

File tree

4 files changed

+58
-6
lines changed

4 files changed

+58
-6
lines changed

test/test_cuda.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4637,6 +4637,27 @@ def test_memory_snapshot(self):
46374637
torch.cuda.memory._record_memory_history(False)
46384638

46394639

4640+
def test_memory_snapshot_with_cpp(self):
4641+
try:
4642+
torch.cuda.memory.empty_cache()
4643+
torch.cuda.memory._record_memory_history(True, _enable_expensive_cpp=True)
4644+
x = torch.rand(311, 411, device='cuda')
4645+
4646+
ss = torch.cuda.memory._snapshot()
4647+
found_it = False
4648+
for seg in ss:
4649+
for b in seg['blocks']:
4650+
if 'history' in b:
4651+
for h in b['history']:
4652+
if h['real_size'] == 311 * 411 * 4:
4653+
self.assertNotEqual(len(h['cpp_frames']), 0)
4654+
found_it = True
4655+
self.assertTrue(found_it)
4656+
4657+
finally:
4658+
torch.cuda.memory._record_memory_history(False)
4659+
4660+
46404661
def test_allocator_settings(self):
46414662
def power2_div(size, div_factor):
46424663
pow2 = 1

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,7 @@ def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ...
11811181
def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ...
11821182
def _cuda_resetPeakMemoryStats(device: _int) -> None: ...
11831183
def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ...
1184-
def _cuda_recordMemoryHistory(enabled: _bool) -> None: ...
1184+
def _cuda_recordMemoryHistory(enabled: _bool, cpp: _bool) -> None: ...
11851185
def _cuda_lock_mutex() -> None: ...
11861186
def _cuda_unlock_mutex() -> None: ...
11871187
def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ...

torch/csrc/cuda/Module.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -536,13 +536,15 @@ struct Frame {
536536

537537
struct StackContext : public c10::cuda::CUDACachingAllocator::Context {
538538
std::vector<Frame> frames;
539+
// Empty if cpp traces weren't enabled
540+
std::string cpp_frames;
539541
~StackContext() {
540542
py::gil_scoped_acquire acquire;
541543
for (auto& f : frames) {
542544
Py_XDECREF((PyObject*)f.code);
543545
}
544546
}
545-
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context> gather() {
547+
static std::unique_ptr<StackContext> _gather() {
546548
py::gil_scoped_acquire acquire;
547549
auto r = std::make_unique<StackContext>();
548550
PyFrameObject* f = PyEval_GetFrame();
@@ -555,6 +557,15 @@ struct StackContext : public c10::cuda::CUDACachingAllocator::Context {
555557
}
556558
return r;
557559
}
560+
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context> gather() {
561+
return _gather();
562+
}
563+
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context>
564+
gather_with_cpp() {
565+
auto r = _gather();
566+
r->cpp_frames = c10::get_backtrace();
567+
return std::move(r);
568+
}
558569
};
559570

560571
PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
@@ -584,6 +595,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
584595
py::str name_s = "name";
585596
py::str line_s = "line";
586597
py::str frames_s = "frames";
598+
py::str cpp_frames_s = "cpp_frames";
587599
py::str history_s = "history";
588600
py::str blocks_s = "blocks";
589601

@@ -626,6 +638,9 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
626638
frame[line_s] = PyCode_Addr2Line(f.code, f.lasti);
627639
frames.append(std::move(frame));
628640
}
641+
if (!sc->cpp_frames.empty()) {
642+
history_entry[cpp_frames_s] = py::cast(sc->cpp_frames);
643+
}
629644
history_entry[frames_s] = std::move(frames);
630645
}
631646
h = h->next.get();
@@ -725,9 +740,10 @@ static void registerCudaDeviceProperties(PyObject* module) {
725740
return stream.str();
726741
});
727742

728-
m.def("_cuda_recordMemoryHistory", [](bool enabled) {
743+
m.def("_cuda_recordMemoryHistory", [](bool enabled, bool cpp) {
729744
c10::cuda::CUDACachingAllocator::setContextRecorder(
730-
enabled ? StackContext::gather : nullptr);
745+
enabled ? (cpp ? StackContext::gather_with_cpp : StackContext::gather)
746+
: nullptr);
731747
});
732748
}
733749

torch/cuda/memory.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,24 @@ def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]:
595595
device = _get_device_index(device)
596596
return torch.cuda.cudart().cudaMemGetInfo(device)
597597

598-
def _record_memory_history(enabled: bool, device: Union[Device, int] = None):
598+
def _record_memory_history(enabled: bool, device: Union[Device, int] = None, *, _enable_expensive_cpp: bool = False):
599+
"""Enables recording of Python stack traces to be associated with memory
600+
allocations, so you can tell what allocated any piece of memory in
601+
:func:`torch.memory_snapshot`.
602+
603+
The Python trace collection is fast (2us per trace), so you may consider
604+
enabling this on production jobs if you anticipate ever having to debug
605+
memory issues.
606+
607+
.. warning:
608+
The :attr:`_enable_expensive_cpp` arguments lets you enable also
609+
collecting C++ stack traces. This collection is VERY SLOW and should
610+
only be used if you are debugging framework problems on a minified
611+
example. In principle, it should be possible to implement fast C++
612+
stack trace collection; file an issue with us if you need it.
613+
"""
599614
with torch.cuda.device(device):
600-
_C._cuda_recordMemoryHistory(enabled)
615+
_C._cuda_recordMemoryHistory(enabled, _enable_expensive_cpp)
601616

602617
def _snapshot(device: Union[Device, int] = None):
603618
with torch.cuda.device(device):

0 commit comments

Comments (0)