mirror of
https://github.com/QwenLM/Qwen.git
synced 2026-05-21 00:45:48 +08:00
73
recipes/tests/test_inference/test_inference_vllm_fschat.py
Normal file
73
recipes/tests/test_inference/test_inference_vllm_fschat.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import pytest
|
||||
import subprocess
|
||||
import torch
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/..")
|
||||
from utils import run_in_subprocess, simple_openai_api, TelnetPort
|
||||
from ut_config import (
|
||||
MODEL_TYPE,
|
||||
DOCKER_VERSION_CU121,
|
||||
DOCKER_MOUNT_DIR,
|
||||
DOCKER_TEST_DIR,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_gpus,use_int4",
|
||||
[
|
||||
(1, False),
|
||||
(1, True),
|
||||
(2, False),
|
||||
# ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.
|
||||
# (2, True)
|
||||
],
|
||||
)
|
||||
def test_inference_vllm_fschat(num_gpus, use_int4):
|
||||
model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
|
||||
container_name = "test_inference_vllm_fschat"
|
||||
cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c '
|
||||
cmd = ""
|
||||
|
||||
cmd += f"""nohup python -m fastchat.serve.controller > /dev/null 2>&1 \
|
||||
& python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 \
|
||||
& python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code"""
|
||||
|
||||
# for GPUS SM < 80 and use_int==True
|
||||
is_ampere = torch.cuda.get_device_capability()[0] >= 8
|
||||
if not is_ampere or use_int4:
|
||||
cmd += " --dtype half"
|
||||
|
||||
snapshot_download(model_type, cache_dir=".", revision="master")
|
||||
# start model server
|
||||
run_in_subprocess(
|
||||
f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
|
||||
)
|
||||
print(cmd_docker + f'"{cmd}"')
|
||||
run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")
|
||||
|
||||
while not TelnetPort("localhost", 21002):
|
||||
print("Wait for the model service start.")
|
||||
time.sleep(0.5)
|
||||
|
||||
if (
|
||||
subprocess.run(
|
||||
f"docker inspect {container_name}",
|
||||
shell=True,
|
||||
stdout=subprocess.DEVNULL,
|
||||
).returncode
|
||||
!= 0
|
||||
):
|
||||
break
|
||||
|
||||
try:
|
||||
simple_openai_api(model_type.split("/")[-1])
|
||||
except Exception as e:
|
||||
time.sleep(1)
|
||||
with open("tmp.log") as f:
|
||||
raise Exception(f"{e} \n {f.read()}")
|
||||
|
||||
run_in_subprocess(f"docker rm -f {container_name}")
|
||||
Reference in New Issue
Block a user