更新时间:2025-07-29 GMT+08:00
分享

run_vllm_multi_node.sh

与run_vllm_single_node.sh一样，run_vllm_multi_node.sh也用于启动与 OpenAI API 兼容的在线推理服务；该脚本会在“Cluster部署准备”章节中用到。

不同的是,run_vllm_multi_node.sh可在多节点上执行,并且通过Ray管理多节点的分布式推理服务。

# Load the ma-user shell profile into this shell.
. /home/ma-user/.bashrc

# Wait for the rank table. The path is handed to the helper through the
# environment, so it must be exported before the helper runs.
# NOTE(review): the helper's exact completion criteria are not visible here.
START_UP_GLOBAL_RANK_TABLE_FILE_PATH="$GLOBAL_RANK_TABLE_FILE_PATH"
export START_UP_GLOBAL_RANK_TABLE_FILE_PATH
python3 /mnt/deepseek/scripts/wait_ki_rank_table_completed.py

# HEAD_IP is whatever get_ip_list.py prints for argument "0"
# (presumably the first node in the rank table -- confirm against the helper).
HEAD_IP=$(PYTHONUNBUFFERED=1 python3 /mnt/deepseek/scripts/get_ip_list.py "0")
export HEAD_IP

# Set only after the helpers above have run, as in the original ordering.
export USE_MM_ALL_REDUCE_OP=1 ASCEND_TURBO_TASK_QUEUE=0

# Detect the NIC used for inter-node communication: find the local IPv4
# address whose first two octets equal those of HEAD_IP, and the interface
# that owns that address.

# Reads ifconfig-style text on stdin and prints "<ip> <ifname>" for the first
# "inet" address that starts with "<prefix>." (prefix is $1). Prints nothing
# when no address matches. The prefix is compared literally, never as a
# regular expression, so "1.2" cannot match an address such as 132.5.6.7.
# The interface name is the first word of the most recent non-indented line,
# with its first ":" removed.
_find_pod_nic() {
  awk -v prefix="$1" '
    /^[^ \t]/ { ifname = $1; sub(/:/, "", ifname) }
    $1 == "inet" && index($2, prefix ".") == 1 { print $2, ifname; exit }
  '
}

# First two octets of HEAD_IP (anything after a "/" is dropped first).
VPC_PREFIX=$(echo "${HEAD_IP}" | cut -d'/' -f1 | cut -d'.' -f1-2)
pod_nic=$(ifconfig | _find_pod_nic "${VPC_PREFIX}")
POD_INET_IP=${pod_nic%% *}
POD_NETWORK_IFNAME=${pod_nic#* }
if [ -z "${POD_INET_IP}" ]; then
  # No local address shares the prefix. Keep the historical result (the first
  # interface that ifconfig lists) but say so instead of doing it silently.
  echo "WARNING: no local inet address matches prefix '${VPC_PREFIX}'; falling back to the first interface reported by ifconfig" >&2
  POD_NETWORK_IFNAME=$(ifconfig | head -n 1 | awk '{print $1}' | sed 's/://')
fi
echo "POD_INET_IP: $POD_INET_IP"
echo "POD_NETWORK_IFNAME: $POD_NETWORK_IFNAME"

# Pin Gloo, TP and HCCL socket traffic to the detected interface.
export GLOO_SOCKET_IFNAME=$POD_NETWORK_IFNAME
export TP_SOCKET_IFNAME=$POD_NETWORK_IFNAME
export HCCL_SOCKET_IFNAME=$POD_NETWORK_IFNAME

# Cards (devices) this process is allowed to use.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# NOTE(review): presumably stops Ray from rewriting ASCEND_RT_VISIBLE_DEVICES
# for its workers -- confirm against the Ray documentation.
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1

# Expand/orchestrate the communication algorithm on the device-side
# AI Vector Core compute units.
export HCCL_OP_EXPANSION_MODE=AIV

# Processor affinity (core binding) for CPU-side operator tasks:
#   0 or unset - core binding disabled
#   1          - coarse-grained core binding
#   2          - fine-grained core binding
export CPU_AFFINITY_CONF=1

# Remaining runtime settings, values unchanged.
export NUMEXPR_MAX_THREADS=192 \
       PYTORCH_NPU_ALLOC_CONF=expandable_segments:True

echo "start actual vllm serve............................"
# Head role (first script argument is "head"): start the Ray head node, wait
# for the workers to join, then launch the OpenAI-compatible vLLM API server
# on port 8080 using Ray as the distributed executor backend.
if [[ "$1" == "head" ]]; then
  # Fail before touching Ray: with an empty model_name the server would be
  # pointed at the bare /mnt/deepseek/model/ directory with an empty name.
  if [[ -z "${model_name:-}" ]]; then
    echo "ERROR: model_name is not set; cannot locate the model under /mnt/deepseek/model/" >&2
    exit 1
  fi

  ray start --head --num-gpus=8
  # Wait for the Ray workers to connect. The second argument defaults to 2;
  # for a 4-machine deployment it must be 4, which can now be supplied via
  # CLUSTER_NODE_COUNT instead of editing this script.
  python3 /mnt/deepseek/scripts/ray_worker_checker.py "${HEAD_IP}:6379" "${CLUSTER_NODE_COUNT:-2}"

  # Expansions are quoted so the model name is never word-split or globbed.
  python -m vllm.entrypoints.openai.api_server \
    --model "/mnt/deepseek/model/${model_name}" \
    --served-model-name="${model_name}" \
    --host=0.0.0.0 \
    --port=8080 \
    --tensor-parallel-size=16 \
    --gpu-memory-utilization=0.95 \
    --block-size=128 \
    --max-num-seqs=256 \
    --max-model-len=32768 \
    --max-num-batched-tokens=32768 \
    --trust-remote-code \
    --distributed-executor-backend=ray
fi

# Worker role (first script argument is "worker"): join the Ray cluster at
# HEAD_IP:6379, retrying every 5 seconds until it succeeds, then keep the
# process alive.
if [[ "$1" == "worker" ]]; then
  ray_address="${HEAD_IP}:6379"
  # Log the command being attempted (same text as before). It is executed
  # directly below rather than through eval, so HEAD_IP is only ever passed
  # as an argument and never re-parsed as shell code.
  echo "ray start --address='${ray_address}' --num-gpus=8 &> /dev/null"
  until ray start --address="${ray_address}" --num-gpus=8 &> /dev/null; do
    echo "failed to connect to ray head node, wait 5s....."
    sleep 5
  done
  echo "succeed to connect to ray head node"

  # Nothing else runs in this script on a worker; sleep in a loop so the
  # process does not exit.
  echo "hold process..."
  while :; do sleep 2073600; done
fi

相关文档