更新时间:2025-07-29 GMT+08:00
分享

ray_worker_checker.py

ray_worker_checker.py脚本用于等待Ray worker节点连接完成。

该脚本由run_vllm_multi_node.sh调用，无需修改。

import ray
import sys
import time

def wait_ray_workers_ready(head_addr: str, target_count: int) -> None:
    """Block until at least ``target_count`` live nodes have joined the Ray cluster.

    Connects to the cluster at ``head_addr`` and polls ``ray.nodes()`` every
    10 seconds until enough nodes are reported alive.

    Args:
        head_addr: Address passed to ``ray.init(address=...)``.
        target_count: Number of live nodes to wait for. The count is taken
            over everything ``ray.nodes()`` returns, so it presumably
            includes the head node as well -- confirm against the caller.

    Raises:
        Whatever ``ray.init`` raises if the initial connection fails; errors
        raised while polling are printed and the poll is retried.
    """
    ray.init(address=head_addr)
    while True:
        try:
            # Only count nodes currently marked alive: ray.nodes() also lists
            # nodes that have died, which must not satisfy the readiness check.
            alive_count = sum(
                1 for node in ray.nodes() if node.get("Alive", True)
            )
        except Exception as e:
            # Best-effort polling: report the failure and retry after the
            # normal delay instead of spinning in a tight loop.
            print(e)
        else:
            # ">=" rather than "==": a cluster with extra nodes is still
            # ready, and a strict equality check would wait forever.
            if alive_count >= target_count:
                print("ray workers are ready.")
                break
            print("ray workers not ready, check 10s later....")
        time.sleep(10)

if __name__ == "__main__":
    # Usage: ray_worker_checker.py <head_address> <node_count>
    # argv[1] is the Ray head address, argv[2] the node count to wait for.
    head_address, expected_nodes = sys.argv[1], int(sys.argv[2])
    wait_ray_workers_ready(head_address, expected_nodes)

相关文档