run_vllm_multi_node.sh
与 run_vllm_single_node.sh 一样,run_vllm_multi_node.sh 也用于启动与 OpenAI API 兼容的在线推理服务,在“Cluster部署准备”章节中会用到。
不同的是,run_vllm_multi_node.sh 可在多个节点上执行,并通过 Ray 管理多节点的分布式推理服务。
# Starts an OpenAI-API-compatible vLLM online inference service across
# multiple nodes, using Ray as the distributed executor backend.
#
# Usage: bash run_vllm_multi_node.sh head|worker
#   head   - start the Ray head node, wait for the workers, then launch vLLM
#   worker - join the Ray cluster at ${HEAD_IP}:6379 and then block forever
#
# Environment read by this script:
#   GLOBAL_RANK_TABLE_FILE_PATH - re-exported as
#                                 START_UP_GLOBAL_RANK_TABLE_FILE_PATH
#   model_name                  - directory name under /mnt/deepseek/model and
#                                 the served model name (head node only)

# Reject an unknown role up front; otherwise the script would run all of the
# setup below and then exit successfully without starting anything.
role="${1:-}"
case "$role" in
  head | worker) ;;
  *)
    echo "usage: $0 head|worker" >&2
    exit 2
    ;;
esac

source /home/ma-user/.bashrc

# Wait for the rank table.
export START_UP_GLOBAL_RANK_TABLE_FILE_PATH="$GLOBAL_RANK_TABLE_FILE_PATH"
python3 /mnt/deepseek/scripts/wait_ki_rank_table_completed.py

# Assign and export separately so a failure of get_ip_list.py is not masked
# by the (always successful) export builtin.
HEAD_IP=$(PYTHONUNBUFFERED=1 python3 /mnt/deepseek/scripts/get_ip_list.py "0")
export HEAD_IP
if [ -z "$HEAD_IP" ]; then
  echo "error: get_ip_list.py returned an empty head node IP" >&2
  exit 1
fi

export USE_MM_ALL_REDUCE_OP=1
export ASCEND_TURBO_TASK_QUEUE=0

# Find the communication NIC: the local interface whose IPv4 address shares
# the first two octets of the head node IP.
VPC_PREFIX=$(echo "${HEAD_IP}" | cut -d'/' -f1 | cut -d'.' -f1-2)
# Escape the dot so it is matched literally inside the PCRE below.
vpc_prefix_re=${VPC_PREFIX//./\\.}
POD_INET_IP=$(ifconfig | grep -oP "(?<=inet\s)${vpc_prefix_re}\.\d+\.\d+" | head -n 1)
if [ -z "$POD_INET_IP" ]; then
  # With an empty pattern the interface lookup below would match every line
  # and silently pick the first interface, so stop here instead.
  echo "error: no local IPv4 address found with prefix '${VPC_PREFIX}'" >&2
  exit 1
fi
# The interface name is taken from the line directly above the matching
# "inet" line, with the trailing colon removed.
POD_NETWORK_IFNAME=$(ifconfig | grep -F -B 1 -- "$POD_INET_IP" | head -n 1 | awk '{print $1}' | sed 's/://')
if [ -z "$POD_NETWORK_IFNAME" ]; then
  echo "error: could not determine the interface name for ${POD_INET_IP}" >&2
  exit 1
fi
echo "POD_INET_IP: $POD_INET_IP"
echo "POD_NETWORK_IFNAME: $POD_NETWORK_IFNAME"

# Pin the communication NIC.
export GLOO_SOCKET_IFNAME=$POD_NETWORK_IFNAME
export TP_SOCKET_IFNAME=$POD_NETWORK_IFNAME
export HCCL_SOCKET_IFNAME=$POD_NETWORK_IFNAME

export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
export NUMEXPR_MAX_THREADS=192
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# Expand the communication-algorithm orchestration on the device-side
# AI Vector Core compute units.
export HCCL_OP_EXPANSION_MODE=AIV
# Devices this node may use.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# CPU affinity (core binding) for CPU-side operator tasks.
# 0 or unset: binding disabled; 1: coarse-grained; 2: fine-grained.
export CPU_AFFINITY_CONF=1

echo "start actual vllm serve............................"

if [ "$role" = "head" ]; then
  # The model path and served name are both built from model_name; refuse to
  # start with an empty value rather than serving "/mnt/deepseek/model/".
  : "${model_name:?model_name must be set on the head node}"

  ray start --head --num-gpus=8

  # Wait for the ray workers to connect. Note: for a 4-node deployment the
  # second argument must be changed to 4.
  # NOTE(review): --tensor-parallel-size below presumably has to be adjusted
  # together with the node count - confirm against the deployment guide.
  python3 /mnt/deepseek/scripts/ray_worker_checker.py "${HEAD_IP}:6379" 2

  python -m vllm.entrypoints.openai.api_server \
    --model "/mnt/deepseek/model/${model_name}" \
    --served-model-name="${model_name}" \
    --host=0.0.0.0 \
    --port=8080 \
    --tensor-parallel-size=16 \
    --gpu-memory-utilization=0.95 \
    --block-size=128 \
    --max-num-seqs=256 \
    --max-model-len=32768 \
    --max-num-batched-tokens=32768 \
    --trust-remote-code \
    --distributed-executor-backend=ray
else
  # Worker: log the command once, then retry every 5 seconds until the head
  # node accepts the connection. The command is invoked directly (no eval),
  # with its output discarded.
  echo "ray start --address='${HEAD_IP}:6379' --num-gpus=8 &> /dev/null"
  until ray start --address="${HEAD_IP}:6379" --num-gpus=8 > /dev/null 2>&1; do
    echo "failed to connect to ray head node, wait 5s....."
    sleep 5
  done
  echo "succeed to connect to ray head node"

  # Block forever (sleeping in 2073600-second intervals) so that this script
  # does not exit after joining the cluster.
  echo "hold process..."
  while :; do sleep 2073600; done
fi