init commit of recipes (#1027)

Add recipes
2026-05-20 16:35:47 +08:00 · 2024-01-30 01:57:09 -06:00
parent d275e5b91a
commit ee01f36ed9
30 changed files with 5146 additions and 0 deletions
--- a/recipes/tests/test_inference/test_inference_api.py
+++ b/recipes/tests/test_inference/test_inference_api.py
@@ -0,0 +1,112 @@
+import os
+import sys
+import time
+import pytest
+import subprocess
+import torch
+from modelscope.hub.snapshot_download import snapshot_download
+
+sys.path.append(os.path.dirname(__file__) + "/..")
+from utils import run_in_subprocess, simple_openai_api, TelnetPort
+from ut_config import (
+    MODEL_TYPE,
+    DOCKER_VERSION_CU114,
+    DOCKER_VERSION_CU117,
+    DOCKER_VERSION_CU121,
+    DOCKER_MOUNT_DIR,
+    DOCKER_TEST_DIR,
+)
+
+# Upper bound, in seconds, on how long the test polls for the server port before
+# giving up and letting the API request fail with the server log attached.
+SERVER_START_TIMEOUT = 30 * 60
+
+
+# CPU-only combinations are left out of the matrix below because they fail:
+# use_cpu=True,use_int4=False RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
+# use_cpu=True,use_int4=True ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU.You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object
+@pytest.mark.parametrize(
+    "docker_version,use_cpu,use_int4",
+    [
+        (DOCKER_VERSION_CU114, False, False),
+        (DOCKER_VERSION_CU114, False, True),
+        (DOCKER_VERSION_CU117, False, False),
+        (DOCKER_VERSION_CU117, False, True),
+        (DOCKER_VERSION_CU121, False, False),
+        (DOCKER_VERSION_CU121, False, True),
+    ],
+)
+def test_inference_api(docker_version, use_cpu, use_int4):
+    """Serve a chat model with ``openai_api.py`` in Docker and query it.
+
+    Downloads the model snapshot, launches the API server in a container
+    started from ``docker_version``, polls ``TelnetPort("localhost", 8000)``
+    until it reports success, then sends a request via ``simple_openai_api``.
+    The container is removed at the end whether or not the request succeeded.
+
+    Args:
+        docker_version: Image reference passed to ``docker run``.
+        use_cpu: If true, ``--cpu-only`` is appended to the server command.
+        use_int4: If true, the ``-Chat-Int4`` model variant is served
+            instead of ``-Chat``.
+
+    Raises:
+        Exception: If the API request fails. The message holds the original
+            error followed by the contents of ``tmp.log``.
+    """
+    container_name = "test_inference_api"
+    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
+    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c '
+    # ``|| echo`` keeps the command's exit status at zero when the container is
+    # already gone, so the same command serves pre-start and final cleanup.
+    cmd_cleanup = f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
+    cmd = ""
+    # flash-attn is uninstalled on GPUs with compute capability (SM) below 8.0
+    is_ampere = torch.cuda.get_device_capability()[0] >= 8
+    if not is_ampere:
+        cmd += "pip uninstall -y flash-attn && "
+
+    cmd += f"python {DOCKER_MOUNT_DIR}/openai_api.py -c {DOCKER_TEST_DIR}/{model_type}"
+
+    if use_cpu:
+        cmd += " --cpu-only"
+
+    snapshot_download(model_type, cache_dir=".", revision="master")
+    # start model server
+    print(cmd_docker + f'"{cmd}"')
+    run_in_subprocess(cmd_cleanup)
+    run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")
+
+    try:
+        deadline = time.monotonic() + SERVER_START_TIMEOUT
+        while not TelnetPort("localhost", 8000):
+            print("Wait for the model service start.")
+            time.sleep(0.5)
+
+            container_gone = (
+                subprocess.run(
+                    f"docker inspect {container_name}",
+                    shell=True,
+                    stdout=subprocess.DEVNULL,
+                ).returncode
+                != 0
+            )
+            # Stop polling once the container has disappeared or the deadline
+            # has passed; the request below is then expected to fail and
+            # report tmp.log.
+            if container_gone or time.monotonic() >= deadline:
+                break
+
+        try:
+            # When an Int4 model such as Qwen-1_8B-Chat-Int4 is loaded, the
+            # model name is still Qwen-1_8B-Chat.
+            simple_openai_api(f"{MODEL_TYPE}-Chat".split("/")[-1])
+        except Exception as e:
+            # NOTE(review): the pause presumably lets the backgrounded docker
+            # process finish writing tmp.log -- confirm.
+            time.sleep(1)
+            with open("tmp.log") as f:
+                raise Exception(f"{e} \n {f.read()}") from e
+    finally:
+        # Runs on success and on failure so that a failed case does not
+        # leave the container running.
+        run_in_subprocess(cmd_cleanup)
--- a/recipes/tests/test_inference/test_inference_vllm_fschat.py
+++ b/recipes/tests/test_inference/test_inference_vllm_fschat.py
@@ -0,0 +1,73 @@
@@ -0,0 +1,105 @@
+import os
+import sys
+import time
+import pytest
+import subprocess
+import torch
+from modelscope.hub.snapshot_download import snapshot_download
+
+sys.path.append(os.path.dirname(__file__) + "/..")
+from utils import run_in_subprocess, simple_openai_api, TelnetPort
+from ut_config import (
+    MODEL_TYPE,
+    DOCKER_VERSION_CU121,
+    DOCKER_MOUNT_DIR,
+    DOCKER_TEST_DIR,
+)
+
+# Upper bound, in seconds, on how long the test polls for the server port before
+# giving up and letting the API request fail with the server log attached.
+SERVER_START_TIMEOUT = 30 * 60
+
+
+@pytest.mark.parametrize(
+    "num_gpus,use_int4",
+    [
+        (1, False),
+        (1, True),
+        (2, False),
+        # (2, True) is left out because it fails with:
+        # ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.
+    ],
+)
+def test_inference_vllm_fschat(num_gpus, use_int4):
+    """Serve a chat model with FastChat and a vLLM worker in Docker and query it.
+
+    Downloads the model snapshot, starts the FastChat controller, the
+    OpenAI-compatible API server (port 8000) and the vLLM worker inside one
+    container, polls ``TelnetPort("localhost", 21002)`` until it reports
+    success, then sends a request via ``simple_openai_api``. The container is
+    removed at the end whether or not the request succeeded.
+
+    Args:
+        num_gpus: Value passed to the worker's ``--tensor-parallel-size``.
+        use_int4: If true, the ``-Chat-Int4`` model variant is served and
+            ``--dtype half`` is appended to the worker command.
+
+    Raises:
+        Exception: If the API request fails. The message holds the original
+            error followed by the contents of ``tmp.log``.
+    """
+    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
+    container_name = "test_inference_vllm_fschat"
+    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c '
+    # ``|| echo`` keeps the command's exit status at zero when the container is
+    # already gone, so the same command serves pre-start and final cleanup.
+    cmd_cleanup = f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
+
+    cmd = f"""nohup python -m fastchat.serve.controller > /dev/null 2>&1 \
+    & python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 \
+    & python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code"""
+
+    # ``--dtype half`` is used on GPUs with SM < 8.0 and whenever use_int4 is set
+    is_ampere = torch.cuda.get_device_capability()[0] >= 8
+    if not is_ampere or use_int4:
+        cmd += " --dtype half"
+
+    snapshot_download(model_type, cache_dir=".", revision="master")
+    # start model server
+    run_in_subprocess(cmd_cleanup)
+    print(cmd_docker + f'"{cmd}"')
+    run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")
+
+    try:
+        deadline = time.monotonic() + SERVER_START_TIMEOUT
+        # NOTE(review): 21002 is presumably the vLLM worker's port, while the
+        # API server listens on 8000 -- confirm against the FastChat defaults.
+        while not TelnetPort("localhost", 21002):
+            print("Wait for the model service start.")
+            time.sleep(0.5)
+
+            container_gone = (
+                subprocess.run(
+                    f"docker inspect {container_name}",
+                    shell=True,
+                    stdout=subprocess.DEVNULL,
+                ).returncode
+                != 0
+            )
+            # Stop polling once the container has disappeared or the deadline
+            # has passed; the request below is then expected to fail.
+            if container_gone or time.monotonic() >= deadline:
+                break
+
+        try:
+            simple_openai_api(model_type.split("/")[-1])
+        except Exception as e:
+            # NOTE(review): the pause presumably lets the backgrounded docker
+            # process finish writing tmp.log -- confirm.
+            time.sleep(1)
+            with open("tmp.log") as f:
+                raise Exception(f"{e} \n {f.read()}") from e
+    finally:
+        # Runs on success and on failure so that a failed case does not
+        # leave the container running.
+        run_in_subprocess(cmd_cleanup)