Commit 0bc8ed6

[Test] Fix flaky tests
1 parent 3d5dd1a commit 0bc8ed6

5 files changed: +60 -38 lines changed

test/services/test_python_executor_service.py

Lines changed: 9 additions & 7 deletions

@@ -73,7 +73,7 @@ def test_service_execution(self, ray_init):
 result = x + y
 print(f"Result: {result}")
 """
-        result = ray.get(executor.execute.remote(code), timeout=2)
+        result = ray.get(executor.execute.remote(code), timeout=10)

         assert result["success"] is True
         assert "Result: 30" in result["stdout"]

@@ -101,7 +101,7 @@ def test_service_execution_error(self, ray_init):

         # Execute code with an error
         code = "raise ValueError('Test error')"
-        result = ray.get(executor.execute.remote(code), timeout=2)
+        result = ray.get(executor.execute.remote(code), timeout=10)

         assert result["success"] is False
         assert "ValueError: Test error" in result["stderr"]

@@ -119,7 +119,7 @@ def test_multiple_executions(self, ray_init):
             "python_executor",
             PythonExecutorService,
             pool_size=4,
-            timeout=5.0,
+            timeout=10.0,
             num_cpus=4,
             max_concurrency=4,
         )

@@ -132,14 +132,16 @@ def test_multiple_executions(self, ray_init):
                 code = f"print('Execution {i}')"
                 futures.append(executor.execute.remote(code))

-            # Wait for all to complete
-            results = ray.get(futures, timeout=5)
+            # Wait for all to complete with longer timeout
+            results = ray.get(futures, timeout=30)

             # All should succeed
             assert len(results) == 8
             for i, result in enumerate(results):
-                assert result["success"] is True
-                assert f"Execution {i}" in result["stdout"]
+                assert result["success"] is True, f"Execution {i} failed: {result}"
+                assert (
+                    f"Execution {i}" in result["stdout"]
+                ), f"Expected 'Execution {i}' in stdout, got: {result['stdout']!r}"

         finally:
             services.reset()
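
What makes the original timeouts flaky is that ray.get(ref, timeout=...) raises ray.exceptions.GetTimeoutError as soon as the deadline passes, so a tight 2 s budget fails whenever a loaded CI worker is slow to schedule or run the actor call. A minimal sketch of that behavior, using a hypothetical SlowActor purely for illustration:

import time

import ray
from ray.exceptions import GetTimeoutError

ray.init(ignore_reinit_error=True)


@ray.remote
class SlowActor:
    """Hypothetical actor that simulates a busy executor on a loaded CI node."""

    def work(self, seconds: float) -> str:
        time.sleep(seconds)
        return "done"


actor = SlowActor.remote()

try:
    # A tight 2 s budget fails as soon as scheduling or execution is a bit slow.
    ray.get(actor.work.remote(3.0), timeout=2)
except GetTimeoutError:
    print("timed out - this is the flakiness the commit removes")

# A roomier budget only triggers when something is genuinely stuck.
print(ray.get(actor.work.remote(3.0), timeout=10))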

test/test_collector.py

Lines changed: 14 additions & 6 deletions

@@ -13,6 +13,7 @@
 import subprocess
 import sys
 import time
+from contextlib import nullcontext
 from unittest.mock import patch

 import numpy as np

@@ -1487,12 +1488,14 @@ def env_fn(seed):
         assert_allclose_td(data10, data20)

     @pytest.mark.parametrize("use_async", [False, True])
-    @pytest.mark.parametrize("cudagraph", [False, True])
+    @pytest.mark.parametrize(
+        "cudagraph", [False, True] if torch.cuda.is_available() else [False]
+    )
     @pytest.mark.parametrize(
         "weight_sync_scheme",
         [None, MultiProcessWeightSyncScheme, SharedMemWeightSyncScheme],
     )
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
+    # @pytest.mark.skipif(not torch.cuda.is_available() and not torch.mps.is_available(), reason="no cuda/mps device found")
     def test_update_weights(self, use_async, cudagraph, weight_sync_scheme):
         def create_env():
             return ContinuousActionVecMockEnv()

@@ -1509,11 +1512,12 @@ def create_env():
         kwargs = {}
         if weight_sync_scheme is not None:
             kwargs["weight_sync_schemes"] = {"policy": weight_sync_scheme()}
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
         collector = collector_class(
             [create_env] * 3,
             policy=policy,
-            device=[torch.device("cuda:0")] * 3,
-            storing_device=[torch.device("cuda:0")] * 3,
+            device=[torch.device(device)] * 3,
+            storing_device=[torch.device(device)] * 3,
             frames_per_batch=20,
             cat_results="stack",
             cudagraph_policy=cudagraph,

@@ -1544,7 +1548,9 @@ def create_env():
         # check they don't match
         for worker in range(3):
             for k in state_dict[f"worker{worker}"]["policy_state_dict"]:
-                with pytest.raises(AssertionError):
+                with pytest.raises(
+                    AssertionError
+                ) if torch.cuda.is_available() else nullcontext():
                     torch.testing.assert_close(
                         state_dict[f"worker{worker}"]["policy_state_dict"][k],
                         policy_state_dict[k].cpu(),

@@ -2401,7 +2407,9 @@ def test_auto_wrap_error(self, collector_class, env_maker, num_envs):
         policy = UnwrappablePolicy(out_features=env_maker().action_spec.shape[-1])
         with pytest.raises(
             TypeError,
-            match=("Arguments to policy.forward are incompatible with entries in"),
+            match=(
+                "Arguments to policy.forward are incompatible with entries in|Failed to wrap the policy. If the policy needs to be trusted, set trust_policy=True."
+            ),
         ):
             collector_class(
                 **self._create_collector_kwargs(
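
The pattern used above, parametrizing the CUDA-only case only when a GPU is present and swapping pytest.raises for contextlib.nullcontext when no mismatch is expected, is what lets the same test run on CPU-only machines. A standalone sketch of that pattern, with a hypothetical test that is not part of the suite:

from contextlib import nullcontext

import pytest
import torch


@pytest.mark.parametrize(
    # Only exercise the CUDA case when a GPU is present; otherwise keep CPU only.
    "device", ["cuda:0", "cpu"] if torch.cuda.is_available() else ["cpu"]
)
def test_weights_should_differ(device):
    reference = torch.zeros(3)
    # On CUDA the "updated" weights are shifted, so a mismatch is expected;
    # on CPU they stay identical and no assertion error should be raised.
    updated = torch.zeros(3, device=device) + (1.0 if device != "cpu" else 0.0)

    ctx = pytest.raises(AssertionError) if device != "cpu" else nullcontext()
    with ctx:
        torch.testing.assert_close(reference, updated.cpu())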

torchrl/collectors/collectors.py

Lines changed: 19 additions & 15 deletions

@@ -135,18 +135,19 @@ class _InterruptorManager(SyncManager):
 _InterruptorManager.register("_Interruptor", _Interruptor)


-def recursive_map_to_cpu(dictionary: OrderedDict) -> OrderedDict:
-    """Maps the tensors to CPU through a nested dictionary."""
-    return OrderedDict(
-        **{
-            k: recursive_map_to_cpu(item)
-            if isinstance(item, OrderedDict)
-            else item.cpu()
-            if isinstance(item, torch.Tensor)
-            else item
-            for k, item in dictionary.items()
-        }
-    )
+def _map_to_cpu_if_needed(x):
+    """Map tensors on exotic devices (MPS, NPU, etc.) to CPU.
+
+    CPU and CUDA tensors are kept as-is since they can be shared across processes.
+    Only exotic devices that don't support multiprocessing are mapped to CPU.
+    """
+    if isinstance(x, torch.Tensor):
+        # CPU and CUDA can be shared across processes
+        if x.device.type in ("cpu", "cuda"):
+            return x
+        # Exotic devices (MPS, NPU, etc.) need to be mapped to CPU
+        return x.cpu()
+    return x


 class DataCollectorBase(IterableDataset, metaclass=abc.ABCMeta):

@@ -1149,7 +1150,7 @@ def _setup_policy_and_weights(self, policy: TensorDictModule | Callable) -> None
                 )
             except (TypeError, AttributeError, ValueError) as err:
                 raise TypeError(
-                    "Failed to wrap the policy. If the policy needs to be trusted, set trust_policy=True."
+                    "Failed to wrap the policy. If the policy needs to be trusted, set trust_policy=True. Scroll up for more details."
                 ) from err
             self._wrapped_policy = wrapped_policy
         else:

@@ -4880,9 +4881,12 @@ def cast_tensor(x, MPS_ERROR=MPS_ERROR):
                 continue

             elif msg == "state_dict":
+                from torch.utils._pytree import tree_map
+
                 state_dict = inner_collector.state_dict()
-                # send state_dict to cpu first
-                state_dict = recursive_map_to_cpu(state_dict)
+                # Map exotic devices (MPS, NPU, etc.) to CPU for multiprocessing compatibility
+                # CPU and CUDA tensors are already shareable and don't need conversion
+                state_dict = tree_map(_map_to_cpu_if_needed, state_dict)
                 pipe_child.send((state_dict, "state_dict"))
                 has_timed_out = False
                 continue
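
torch.utils._pytree.tree_map applies a function to every leaf of a nested container, which is why it can replace the hand-rolled recursion over OrderedDicts. A small sketch of how the new helper composes with it, assuming a toy state dict:

from collections import OrderedDict

import torch
from torch.utils._pytree import tree_map


def _map_to_cpu_if_needed(x):
    # Same rule as the helper above: only move tensors whose device cannot be
    # shared across processes (MPS, NPU, ...); keep CPU/CUDA tensors untouched.
    if isinstance(x, torch.Tensor):
        if x.device.type in ("cpu", "cuda"):
            return x
        return x.cpu()
    return x


state_dict = OrderedDict(
    layer=OrderedDict(weight=torch.randn(2, 2), bias=torch.zeros(2)),
    step=3,  # non-tensor leaves are returned unchanged
)

# tree_map walks every leaf of the nested structure, so no manual recursion is needed.
cpu_state_dict = tree_map(_map_to_cpu_if_needed, state_dict)
print(type(cpu_state_dict), cpu_state_dict["layer"]["weight"].device)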

torchrl/envs/batched_envs.py

Lines changed: 6 additions & 2 deletions

@@ -2701,7 +2701,6 @@ def _run_worker_pipe_direct(
         if event is not None:
             event.record()
             event.synchronize()
-        mp_event.set()
         if consolidate:
             try:
                 child_pipe.send(

@@ -2713,6 +2712,9 @@ def _run_worker_pipe_direct(
                 raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err
         else:
             child_pipe.send(cur_td)
+        # Set event after successfully sending through pipe to avoid race condition
+        # where event is set but pipe send fails (BrokenPipeError)
+        mp_event.set()

         del cur_td

@@ -2726,7 +2728,6 @@ def _run_worker_pipe_direct(
         if event is not None:
             event.record()
             event.synchronize()
-        mp_event.set()
         if consolidate:
             try:
                 next_td = next_td.consolidate(

@@ -2735,6 +2736,9 @@ def _run_worker_pipe_direct(
             except Exception as err:
                 raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err
         child_pipe.send(next_td)
+        # Set event after successfully sending through pipe to avoid race condition
+        # where event is set but pipe send fails (BrokenPipeError)
+        mp_event.set()

         del next_td

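
The reordering matters because the parent side typically waits on mp_event and then reads from the pipe; if the worker sets the event first and the following send raises BrokenPipeError, the parent wakes up with nothing to receive. A stripped-down sketch of the corrected ordering, with an illustrative worker that is not the actual TorchRL loop:

import multiprocessing as mp


def worker(child_pipe, mp_event):
    payload = {"step": 1, "reward": 0.5}
    try:
        # Send first; only signal readiness once the data really is in the pipe.
        child_pipe.send(payload)
        mp_event.set()
    except BrokenPipeError:
        # The parent side is gone: leave the event unset instead of signalling
        # a result that will never arrive.
        pass


if __name__ == "__main__":
    parent_end, child_end = mp.Pipe()
    event = mp.Event()
    proc = mp.Process(target=worker, args=(child_end, event))
    proc.start()

    # The parent waits on the event and then reads; with the corrected ordering
    # a set event guarantees there is something to receive.
    if event.wait(timeout=5):
        print(parent_end.recv())
    proc.join()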

torchrl/envs/llm/transforms/tools.py

Lines changed: 12 additions & 8 deletions

@@ -906,9 +906,9 @@ def execute(self, prompt: str) -> dict[str, Any]:
                 except queue.Empty:
                     pass

-                if not start_found:
-                    timeout_val -= 0.1
-                    time.sleep(0.1)
+                # Always sleep a bit to avoid busy-waiting and give subprocess time
+                timeout_val -= 0.01
+                time.sleep(0.01)

         except Exception as e:
             return {

@@ -1007,8 +1007,10 @@ def __init__(self, pool_size: int = 32, timeout: float = 10.0):
         self.processes = [
             PersistentPythonProcess(timeout=timeout) for _ in range(pool_size)
         ]
+        # Create a lock for each process to prevent concurrent access
+        self.process_locks = [threading.Lock() for _ in range(pool_size)]
         self.next_idx = 0
-        self._lock = threading.Lock()
+        self._selection_lock = threading.Lock()

     def execute(self, code: str) -> dict:
         """Execute Python code using next available process (round-robin).

@@ -1019,12 +1021,14 @@ def execute(self, code: str) -> dict:
         Returns:
             dict: Execution result with keys 'success', 'stdout', 'stderr', 'returncode'.
         """
-        # Simple round-robin - Ray handles the queuing via max_concurrency
-        with self._lock:
-            process = self.processes[self.next_idx]
+        # Select a process using round-robin
+        with self._selection_lock:
+            process_idx = self.next_idx
             self.next_idx = (self.next_idx + 1) % self.pool_size

-        return process.execute(code)
+        # Lock the selected process for the duration of execution
+        with self.process_locks[process_idx]:
+            return self.processes[process_idx].execute(code)

     def cleanup(self):
         """Cleanup all processes in the pool."""

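The pool change splits one coarse lock into two levels: a short-lived selection lock that only guards the round-robin index, and a per-process lock held for the whole execution, so two callers never share the same interpreter while different processes still run concurrently. A generic sketch of that locking scheme, with a hypothetical WorkerPool standing in for the persistent Python processes:

import threading


class WorkerPool:
    """Hypothetical pool illustrating the selection-lock / per-worker-lock split."""

    def __init__(self, pool_size: int = 4):
        self.pool_size = pool_size
        # Stand-ins for the persistent Python subprocesses in the real class.
        self.workers = [f"worker-{i}" for i in range(pool_size)]
        self.worker_locks = [threading.Lock() for _ in range(pool_size)]
        self._selection_lock = threading.Lock()
        self.next_idx = 0

    def run(self, job: str) -> str:
        # Hold the selection lock only long enough to pick an index (round-robin).
        with self._selection_lock:
            idx = self.next_idx
            self.next_idx = (self.next_idx + 1) % self.pool_size
        # Hold the chosen worker's lock for the whole job, so the same worker is
        # never used by two callers at once while other workers remain available.
        with self.worker_locks[idx]:
            return f"{self.workers[idx]} handled {job}"


pool = WorkerPool(pool_size=2)
threads = [
    threading.Thread(target=lambda i=i: print(pool.run(f"job-{i}"))) for i in range(4)
]
for t in threads:
    t.start()
for t in threads:
    t.join()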