Skip to content

Commit adf5919

Browse files
ezyang authored and pytorchmergebot committed
Add option to record C++ backtraces in _record_memory_history (pytorch#86145)
I used this to debug pytorch#86136 so it is useful. The implementation is not so fast so it is not enabled by default. Signed-off-by: Edward Z. Yang <[email protected]> Pull Request resolved: pytorch#86145 Approved by: https://github.com/albanD, https://github.com/zdevito
1 parent 97d6b5b commit adf5919

File tree

4 files changed

+58
-6
lines changed

4 files changed

+58
-6
lines changed

test/test_cuda.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4637,6 +4637,27 @@ def test_memory_snapshot(self):
46374637
torch.cuda.memory._record_memory_history(False)
46384638

46394639

4640+
def test_memory_snapshot_with_cpp(self):
4641+
try:
4642+
torch.cuda.memory.empty_cache()
4643+
torch.cuda.memory._record_memory_history(True, _enable_expensive_cpp=True)
4644+
x = torch.rand(311, 411, device='cuda')
4645+
4646+
ss = torch.cuda.memory._snapshot()
4647+
found_it = False
4648+
for seg in ss:
4649+
for b in seg['blocks']:
4650+
if 'history' in b:
4651+
for h in b['history']:
4652+
if h['real_size'] == 311 * 411 * 4:
4653+
self.assertNotEqual(len(h['cpp_frames']), 0)
4654+
found_it = True
4655+
self.assertTrue(found_it)
4656+
4657+
finally:
4658+
torch.cuda.memory._record_memory_history(False)
4659+
4660+
46404661
def test_allocator_settings(self):
46414662
def power2_div(size, div_factor):
46424663
pow2 = 1

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,7 @@ def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ...
11811181
def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ...
11821182
def _cuda_resetPeakMemoryStats(device: _int) -> None: ...
11831183
def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ...
1184-
def _cuda_recordMemoryHistory(enabled: _bool) -> None: ...
1184+
def _cuda_recordMemoryHistory(enabled: _bool, cpp: _bool) -> None: ...
11851185
def _cuda_lock_mutex() -> None: ...
11861186
def _cuda_unlock_mutex() -> None: ...
11871187
def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ...

torch/csrc/cuda/Module.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -536,13 +536,15 @@ struct Frame {
536536

537537
struct StackContext : public c10::cuda::CUDACachingAllocator::Context {
538538
std::vector<Frame> frames;
539+
// Empty if cpp traces weren't enabled
540+
std::string cpp_frames;
539541
~StackContext() {
540542
py::gil_scoped_acquire acquire;
541543
for (auto& f : frames) {
542544
Py_XDECREF((PyObject*)f.code);
543545
}
544546
}
545-
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context> gather() {
547+
static std::unique_ptr<StackContext> _gather() {
546548
py::gil_scoped_acquire acquire;
547549
auto r = std::make_unique<StackContext>();
548550
PyFrameObject* f = PyEval_GetFrame();
@@ -555,6 +557,15 @@ struct StackContext : public c10::cuda::CUDACachingAllocator::Context {
555557
}
556558
return r;
557559
}
560+
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context> gather() {
561+
return _gather();
562+
}
563+
static std::unique_ptr<c10::cuda::CUDACachingAllocator::Context>
564+
gather_with_cpp() {
565+
auto r = _gather();
566+
r->cpp_frames = c10::get_backtrace();
567+
return std::move(r);
568+
}
558569
};
559570

560571
PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
@@ -584,6 +595,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
584595
py::str name_s = "name";
585596
py::str line_s = "line";
586597
py::str frames_s = "frames";
598+
py::str cpp_frames_s = "cpp_frames";
587599
py::str history_s = "history";
588600
py::str blocks_s = "blocks";
589601

@@ -626,6 +638,9 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
626638
frame[line_s] = PyCode_Addr2Line(f.code, f.lasti);
627639
frames.append(std::move(frame));
628640
}
641+
if (!sc->cpp_frames.empty()) {
642+
history_entry[cpp_frames_s] = py::cast(sc->cpp_frames);
643+
}
629644
history_entry[frames_s] = std::move(frames);
630645
}
631646
h = h->next.get();
@@ -725,9 +740,10 @@ static void registerCudaDeviceProperties(PyObject* module) {
725740
return stream.str();
726741
});
727742

728-
m.def("_cuda_recordMemoryHistory", [](bool enabled) {
743+
m.def("_cuda_recordMemoryHistory", [](bool enabled, bool cpp) {
729744
c10::cuda::CUDACachingAllocator::setContextRecorder(
730-
enabled ? StackContext::gather : nullptr);
745+
enabled ? (cpp ? StackContext::gather_with_cpp : StackContext::gather)
746+
: nullptr);
731747
});
732748
}
733749

torch/cuda/memory.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,24 @@ def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]:
595595
device = _get_device_index(device)
596596
return torch.cuda.cudart().cudaMemGetInfo(device)
597597

598-
def _record_memory_history(enabled: bool, device: Union[Device, int] = None):
598+
def _record_memory_history(enabled: bool, device: Union[Device, int] = None, *, _enable_expensive_cpp: bool = False):
599+
"""Enables recording of Python stack traces to be associated with memory
600+
allocations, so you can tell what allocated any piece of memory in
601+
:func:`torch.memory_snapshot`.
602+
603+
The Python trace collection is fast (2us per trace), so you may consider
604+
enabling this on production jobs if you anticipate ever having to debug
605+
memory issues.
606+
607+
.. warning:
608+
The :attr:`_enable_expensive_cpp` arguments lets you enable also
609+
collecting C++ stack traces. This collection is VERY SLOW and should
610+
only be used if you are debugging framework problems on a minified
611+
example. In principle, it should be possible to implement fast C++
612+
stack trace collection; file an issue with us if you need it.
613+
"""
599614
with torch.cuda.device(device):
600-
_C._cuda_recordMemoryHistory(enabled)
615+
_C._cuda_recordMemoryHistory(enabled, _enable_expensive_cpp)
601616

602617
def _snapshot(device: Union[Device, int] = None):
603618
with torch.cuda.device(device):

0 commit comments

Comments (0)