Skip to content

Commit 78933b1

Browse files
authored
[build] Monkeypatch gen_rst to call inside subprocess (#3355)
Replace sphinx gallery's generate_file_rst call with a wrapper that calls that function in a subprocess, and remove some things that we added in order to handle files changing global state.

I tried using multiprocessing.Pool and apply_async but ran into pickling problems. I don't understand why the current implementation doesn't have that issue.

Parallelism got added in some version of sphinx gallery, but it uses joblib, which seems to result in errors (conflicts with functions in the subprocess library? I'm not sure). Additionally, it has no effect if you set parallel = 1 (it will not put each file run into its own process and run them one at a time), so you need parallel >= 2, but I'm not sure whether there are any tutorials that require being run on their own (e.g. for memory profiling).

AFAICT (although I am not familiar with joblib), the way I do it here is similar to how sphinx gallery does it with joblib, but they can wrap the actual call instead of needing to replace the function: https://github.com/sphinx-gallery/sphinx-gallery/blob/dd092a09513ea1d0616ac9e59b8d76d5a8217e4a/sphinx_gallery/gen_rst.py#L594-L608

I also tried to call sphinx once per file in #3351, but there's an overhead of ~5 min per file due to sphinx doing extra stuff (generating the HTML for all the other files?), resulting in the build taking 2x longer.
1 parent b2e614d commit 78933b1

File tree

2 files changed

+40
-18
lines changed

2 files changed

+40
-18
lines changed

.jenkins/validate_tutorials_built.py

-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
"intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
5454
"advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
5555
"intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixe
56-
"intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs fail. "state" issue, reseting dynamo didn't help
5756
]
5857

5958
def tutorial_source_dirs() -> List[Path]:

conf.py

+40-17
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@
3333
sys.path.insert(0, os.path.abspath('./.jenkins'))
3434
import pytorch_sphinx_theme
3535
import torch
36-
import numpy
37-
import gc
3836
import glob
3937
import random
4038
import shutil
@@ -49,6 +47,46 @@
4947
pio.renderers.default = 'sphinx_gallery'
5048

5149

50+
import sphinx_gallery.gen_rst
51+
import multiprocessing
52+
53+
# Monkey patch sphinx gallery to run each example in an isolated process so that
54+
# we don't need to worry about examples changing global state.
55+
#
56+
# Alt option 1: Parallelism was added to sphinx gallery (a later version that we
57+
# are not using yet) using joblib, but it seems to result in errors for us, and
58+
# it has no effect if you set parallel = 1 (it will not put each file run into
59+
# its own process and run singly) so you need parallel >= 2, and there may be
60+
# tutorials that cannot be run in parallel.
61+
#
62+
# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI
63+
# but with shard sizes of 1), but running sphinx gallery for each file has a
64+
# ~5min overhead, resulting in the entire suite taking ~2x time
65+
def call_fn(func, args, kwargs, result_queue):
    """Run ``func(*args, **kwargs)`` and report the outcome on ``result_queue``.

    Intended as the ``target`` of a child process: the outcome is sent back
    through the queue instead of being returned.

    Args:
        func: Callable to invoke.
        args: Positional arguments for ``func``.
        kwargs: Keyword arguments for ``func``.
        result_queue: Object with a ``put`` method. Receives ``(True, result)``
            on success, or ``(False, message)`` when ``func`` raises, where
            ``message`` is the formatted traceback as a string.
    """
    import traceback

    try:
        result = func(*args, **kwargs)
        result_queue.put((True, result))
    except Exception:
        # Send the full traceback rather than str(e): str(e) drops the
        # exception type and the failing location, and is empty for
        # exceptions raised without a message.
        result_queue.put((False, traceback.format_exc()))
71+
72+
def call_in_subprocess(func):
    """Return a wrapper that runs ``func`` in a separate process.

    Each call to the wrapper starts a ``multiprocessing.Process`` that runs
    ``call_fn`` with ``func`` and the call's arguments, waits for the outcome
    on a ``multiprocessing.Queue``, and then joins the process.

    Args:
        func: Callable to run in the child process.

    Returns:
        A callable with the same call signature as ``func`` that returns
        whatever ``func`` returned in the child.

    Raises:
        RuntimeError: From the wrapper, if ``func`` raised in the child or the
            child exited without reporting a result.
    """
    import functools
    import queue

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        result_queue = multiprocessing.Queue()
        p = multiprocessing.Process(
            target=call_fn,
            args=(func, args, kwargs, result_queue),
        )
        p.start()
        try:
            # Read the result BEFORE joining: a child that has put a large
            # item on the queue does not exit until the item is consumed, so
            # join-then-get can deadlock.
            while True:
                try:
                    success, result = result_queue.get(timeout=1)
                    break
                except queue.Empty:
                    if p.is_alive():
                        continue
                    # The child is gone. Give any in-flight data one last
                    # chance to arrive, then fail instead of blocking forever.
                    try:
                        success, result = result_queue.get(timeout=1)
                        break
                    except queue.Empty:
                        raise RuntimeError(
                            "Error in subprocess: process exited with code "
                            f"{p.exitcode} without returning a result"
                        ) from None
        finally:
            p.join()
        if success:
            return result
        raise RuntimeError(f"Error in subprocess: {result}")

    return wrapper
87+
88+
# Rebind sphinx gallery's generate_file_rst to the subprocess-wrapped version
# so the original function runs via call_in_subprocess.
sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst)
89+
5290
try:
5391
import torchvision
5492
except ImportError:
@@ -97,20 +135,6 @@
97135

98136
# -- Sphinx-gallery configuration --------------------------------------------
99137

100-
def reset_seeds(gallery_conf, fname):
101-
torch.cuda.empty_cache()
102-
torch.backends.cudnn.deterministic = True
103-
torch.backends.cudnn.benchmark = False
104-
torch._dynamo.reset()
105-
torch._inductor.config.force_disable_caches = True
106-
torch.manual_seed(42)
107-
torch.set_default_device(None)
108-
random.seed(10)
109-
numpy.random.seed(10)
110-
torch.set_grad_enabled(True)
111-
112-
gc.collect()
113-
114138
sphinx_gallery_conf = {
115139
'examples_dirs': ['beginner_source', 'intermediate_source',
116140
'advanced_source', 'recipes_source', 'prototype_source'],
@@ -121,7 +145,6 @@ def reset_seeds(gallery_conf, fname):
121145
'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n"
122146
"# https://pytorch.org/tutorials/beginner/colab\n"
123147
"%matplotlib inline"),
124-
'reset_modules': (reset_seeds),
125148
'ignore_pattern': r'_torch_export_nightly_tutorial.py',
126149
'pypandoc': {'extra_args': ['--mathjax', '--toc'],
127150
'filters': ['.jenkins/custom_pandoc_filter.py'],

0 commit comments

Comments
 (0)