-
Notifications
You must be signed in to change notification settings - Fork 160
Description
728604d04ba41d42adb9ce801000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fa4d9b7f590>)
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/data/team/xuelei12/WebAgent/RL_Factory/RL-Factory/verl/single_controller/ray/base.py", line 723, in __init__
(WorkerDict pid=113813) self.worker_dict[key] = user_defined_cls(*init_args_dict[key].get("args", ()), **init_args_dict[key].get("kwargs", {}))
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/data/team/xuelei12/WebAgent/RL_Factory/RL-Factory/verl/workers/fsdp_workers.py", line 112, in __init__
(WorkerDict pid=113813) torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl" if is_cuda_available else "cpu:gloo,npu:hccl", rank=rank, world_size=world_size)
(WorkerDict pid=113813) File "/root/miniconda3/envs/rl_facory_py311/lib/python3.11/site-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
(WorkerDict pid=113813) return func(*args, **kwargs)
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/root/miniconda3/envs/rl_facory_py311/lib/python3.11/site-packages/torch/distributed/c10d_logger.py", line 95, in wrapper
(WorkerDict pid=113813) func_return = func(*args, **kwargs)
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/root/miniconda3/envs/rl_facory_py311/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 1714, in init_process_group
(WorkerDict pid=113813) store, rank, world_size = next(rendezvous_iterator)
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/root/miniconda3/envs/rl_facory_py311/lib/python3.11/site-packages/torch/distributed/rendezvous.py", line 274, in _env_rendezvous_handler
(WorkerDict pid=113813) store = _create_c10d_store(
(WorkerDict pid=113813) ^^^^^^^^^^^^^^^^^^^
(WorkerDict pid=113813) File "/root/miniconda3/envs/rl_facory_py311/lib/python3.11/site-packages/torch/distributed/rendezvous.py", line 194, in _create_c10d_store
(WorkerDict pid=113813) return TCPStore(
(WorkerDict pid=113813) ^^^^^^^^^
(WorkerDict pid=113813) RuntimeError: The server socket has failed to listen on any local network address. port: 60107, useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use