[build] Monkeypatch gen_rst to call inside subprocess (#3355)

clee2000 · web-flow · commit 78933b112bcc · 2025-05-15T13:20:18.000-07:00
Replace sphinx gallery's generate_file_rst call with a wrapper that calls that function in a subprocess, and remove some things that we added in order to handle files changing global state. I tried using multiprocessing.Pool and apply_async but ran into pickling problems; I don't understand why the current implementation doesn't have that issue. Parallelism was added in some version of sphinx gallery, but it uses joblib, which seems to result in errors (conflicts with functions in the subprocess library? I'm not sure). Additionally, it has no effect if you set parallel = 1 (it will not put each file run into its own process and run them one at a time), so you need parallel >= 2, but I'm not sure whether there are any tutorials that need to be run on their own (e.g. for memory profiling). As far as I can tell (although I am not familiar with joblib), the way I do it here is similar to how sphinx gallery does it with joblib, except that they can wrap the actual call instead of needing to replace the function: https://github.com/sphinx-gallery/sphinx-gallery/blob/dd092a09513ea1d0616ac9e59b8d76d5a8217e4a/sphinx_gallery/gen_rst.py#L594-L608 I also tried calling sphinx once per file in #3351, but there is an overhead of ~5 min per file due to sphinx doing extra work (generating the HTML for all the other files?), which made the build take 2x longer.
diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
@@ -53,7 +53,6 @@
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
     "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
     "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixe
-    "intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs fail. "state" issue, reseting dynamo didn't help
 ]
 
 def tutorial_source_dirs() -> List[Path]:
diff --git a/conf.py b/conf.py
@@ -33,8 +33,6 @@
 sys.path.insert(0, os.path.abspath('./.jenkins'))
 import pytorch_sphinx_theme
 import torch
-import numpy
-import gc
 import glob
 import random
 import shutil
@@ -49,6 +47,46 @@
 pio.renderers.default = 'sphinx_gallery'
 
 
+import sphinx_gallery.gen_rst
+import multiprocessing
+
+# Monkey patch sphinx gallery to run each example in an isolated process so that
+# we don't need to worry about examples changing global state.
+#
+# Alt option 1: Parallelism was added to sphinx gallery (a later version that we
+# are not using yet) using joblib, but it seems to result in errors for us, and
+# it has no effect if you set parallel = 1 (it will not put each file run into
+# its own process and run singly) so you need parallel >= 2, and there may be
+# tutorials that cannot be run in parallel.
+#
+# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI
+# but with shard sizes of 1), but running sphinx gallery for each file has a
+# ~5min overhead, resulting in the entire suite taking ~2x time
+def call_fn(func, args, kwargs, result_queue):
+    """Run ``func(*args, **kwargs)`` and report the outcome on ``result_queue``.
+
+    Used as the ``target`` of a child process. One tuple is put on the queue:
+
+    * ``(True, result)`` when ``func`` returns normally, or
+    * ``(False, traceback_text)`` when an ``Exception`` is raised.
+
+    The formatted traceback is sent rather than ``str(e)`` because the latter
+    drops the exception type and location (``KeyError('x')`` would be reported
+    as just ``'x'``), which makes failures hard to diagnose from the parent.
+
+    Args:
+        func: Callable to execute.
+        args: Positional arguments for ``func``.
+        kwargs: Keyword arguments for ``func``.
+        result_queue: Object with a ``put`` method that receives the outcome.
+    """
+    import traceback
+
+    try:
+        result = func(*args, **kwargs)
+        result_queue.put((True, result))
+    except Exception:
+        # Exception objects are not always picklable, so ship text instead.
+        result_queue.put((False, traceback.format_exc()))
+
+def call_in_subprocess(func):
+    """Return a wrapper that runs each call to ``func`` in a new child process.
+
+    The wrapper starts a ``multiprocessing.Process`` whose target is
+    ``call_fn``, waits for the ``(success, payload)`` tuple it puts on a
+    queue, and then either returns the payload or raises.
+
+    Args:
+        func: Callable to execute in the child process.
+
+    Returns:
+        A function with the same call signature as ``func``.
+
+    Raises:
+        RuntimeError: (from the wrapper) if ``func`` raised in the child, or
+            if the child exited without reporting any result.
+    """
+    from queue import Empty
+
+    def wrapper(*args, **kwargs):
+        result_queue = multiprocessing.Queue()
+        p = multiprocessing.Process(
+            target=call_fn,
+            args=(func, args, kwargs, result_queue),
+        )
+        p.start()
+        # Read the result *before* joining: a child that has put data on a
+        # queue does not terminate until that data has been flushed, so
+        # join()-then-get() can deadlock on large results. Polling with a
+        # timeout also lets us notice a child that died (crash, sys.exit,
+        # unpicklable result) without ever reporting, instead of blocking
+        # forever in get().
+        while True:
+            try:
+                success, result = result_queue.get(timeout=1)
+                break
+            except Empty:
+                if p.is_alive():
+                    continue
+            # The child is gone; give any already-flushed result one last
+            # chance to arrive before declaring failure.
+            try:
+                success, result = result_queue.get(timeout=1)
+                break
+            except Empty:
+                p.join()
+                raise RuntimeError(
+                    "Error in subprocess: process exited with code "
+                    f"{p.exitcode} without returning a result"
+                ) from None
+        p.join()
+        if success:
+            return result
+        raise RuntimeError(f"Error in subprocess: {result}")
+    return wrapper
+
+# Rebind sphinx-gallery's generate_file_rst to the subprocess-running wrapper.
+sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst)
+
 try:
     import torchvision
 except ImportError:
@@ -97,20 +135,6 @@
 
 # -- Sphinx-gallery configuration --------------------------------------------
 
-def reset_seeds(gallery_conf, fname):
-    torch.cuda.empty_cache()
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-    torch._dynamo.reset()
-    torch._inductor.config.force_disable_caches = True
-    torch.manual_seed(42)
-    torch.set_default_device(None)
-    random.seed(10)
-    numpy.random.seed(10)
-    torch.set_grad_enabled(True)
-
-    gc.collect()
-
 sphinx_gallery_conf = {
     'examples_dirs': ['beginner_source', 'intermediate_source',
                       'advanced_source', 'recipes_source', 'prototype_source'],
@@ -121,7 +145,6 @@ def reset_seeds(gallery_conf, fname):
     'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n"
                             "# https://pytorch.org/tutorials/beginner/colab\n"
                             "%matplotlib inline"),
-    'reset_modules': (reset_seeds),
     'ignore_pattern': r'_torch_export_nightly_tutorial.py',
     'pypandoc': {'extra_args': ['--mathjax', '--toc'],
                  'filters': ['.jenkins/custom_pandoc_filter.py'],

Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,6 @@`
`53`	`53`	`"intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.`
`54`	`54`	`"advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.`
`55`	`55`	`"intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixe`
`56`		`- "intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs fail. "state" issue, reseting dynamo didn't help`
`57`	`56`	`]`
`58`	`57`
`59`	`58`	`def tutorial_source_dirs() -> List[Path]:`