mirror of
https://github.com/QwenLM/Qwen.git
synced 2026-05-20 16:35:47 +08:00
13
recipes/tests/README.md
Normal file
13
recipes/tests/README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# Unit testing
|
||||
- Run all unit tests
|
||||
```bash
|
||||
cd tests && pytest -s
|
||||
```
|
||||
- Run the unit tests in a single folder
|
||||
```bash
|
||||
cd tests && pytest -s {dir}
|
||||
```
|
||||
- Rerun the test cases that failed in the last run
|
||||
```bash
|
||||
cd tests && pytest -s --lf
|
||||
```
|
||||
0
recipes/tests/__init__.py
Normal file
0
recipes/tests/__init__.py
Normal file
1
recipes/tests/assets/test_sampled_qwen.json
Normal file
1
recipes/tests/assets/test_sampled_qwen.json
Normal file
@@ -0,0 +1 @@
|
||||
[{"conversations": [{"from": "user", "value": "你好"}, {"from": "assistant", "value": "你好!很高兴为你提供帮助。"}], "id": "identity_0"}, {"conversations": [{"from": "user", "value": "给我讲一个年轻人奋斗创业最终取得成功的故事。"}, {"from": "assistant", "value": "这是一个关于一个年轻人奋斗创业最终取得成功的故事。"}], "id": "identity_1"}]
|
||||
100
recipes/tests/test_finetune/test_finetune_ds.py
Normal file
100
recipes/tests/test_finetune/test_finetune_ds.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
import shutil
|
||||
from itertools import product
|
||||
import torch
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/..")
|
||||
from utils import run_in_subprocess
|
||||
from ut_config import (
|
||||
MODEL_TYPE,
|
||||
DOCKER_VERSION_CU114,
|
||||
DOCKER_VERSION_CU117,
|
||||
DOCKER_VERSION_CU121,
|
||||
DOCKER_MOUNT_DIR,
|
||||
DOCKER_TEST_DIR,
|
||||
DATA_DIR,
|
||||
DS_CONFIG_ZERO2_DIR,
|
||||
DS_CONFIG_ZERO3_DIR,
|
||||
)
|
||||
|
||||
# Axes of the finetune test matrix.
is_chat = ["chat", "base"]
docker_version = [DOCKER_VERSION_CU114, DOCKER_VERSION_CU117, DOCKER_VERSION_CU121]

# Every entry below is a tuple of
# (num_gpus, train_type, is_chat, docker_version, deepspeed).
#
# ZeRO3 is incompatible with LoRA when finetuning on base model.
# FSDP or ZeRO3 are incompatible with QLoRA.

# One process, no DeepSpeed config.
parametrize_list_none_ds = list(
    product([1], ["full", "lora"], is_chat, docker_version, [None])
)

# Two processes with the ZeRO2 config.
parametrize_list_ds_zero2 = list(
    product([2], ["full", "lora"], is_chat, docker_version, [DS_CONFIG_ZERO2_DIR])
)

# Two processes with the ZeRO3 config; LoRA is only combined with the chat model.
parametrize_list_ds_zero3 = [
    *product([2], ["full"], is_chat, docker_version, [DS_CONFIG_ZERO3_DIR]),
    *product([2], ["lora"], ["chat"], docker_version, [DS_CONFIG_ZERO3_DIR]),
]

# QLoRA: chat model only, with either no DeepSpeed config or the ZeRO2 one.
parametrize_list_qlora = list(
    product([1, 2], ["qlora"], ["chat"], docker_version, [None, DS_CONFIG_ZERO2_DIR])
)

# Full matrix consumed by the parametrized finetune test.
parametrize_list = [
    *parametrize_list_none_ds,
    *parametrize_list_ds_zero2,
    *parametrize_list_ds_zero3,
    *parametrize_list_qlora,
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "num_gpus,train_type,is_chat,docker_version,deepspeed", parametrize_list
)
def test_finetune(num_gpus, train_type, is_chat, docker_version, deepspeed):
    """Run ``finetune.py`` inside a Docker image and check that it wrote its output.

    Args:
        num_gpus: Value passed to ``torchrun --nproc_per_node``.
        train_type: ``"full"``, ``"lora"`` or ``"qlora"``; selects the LoRA flags
            and which output file is expected.
        is_chat: ``"chat"`` selects the ``-Chat`` checkpoint; any other value
            uses the plain ``MODEL_TYPE`` checkpoint.
        docker_version: Docker image the training command runs in.
        deepspeed: Path passed to ``--deepspeed``, or ``None`` to omit the flag.

    Raises:
        RuntimeError: Propagated from ``run_in_subprocess`` when the docker
            command exits with a non-zero status.
        AssertionError: When the expected config file is missing afterwards.
    """
    # Host-side view of the directory the container writes as
    # "{DOCKER_TEST_DIR}/output_qwen".
    # NOTE(review): this assumes the tests are started from the directory that
    # is mounted at DOCKER_TEST_DIR - confirm.
    output_dir = "output_qwen"

    cmd_docker = f"docker run --gpus all --ipc=host --network=host --rm -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c "
    cmd = ""
    # for GPUs SM < 80
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere:
        cmd = "pip uninstall -y flash-attn && "

    model_type = f"{MODEL_TYPE}-Chat" if is_chat == "chat" else MODEL_TYPE
    model_type = f"{model_type}-Int4" if train_type == "qlora" else model_type
    cmd += (
        f"torchrun --nproc_per_node {num_gpus} --nnodes 1 --node_rank 0 --master_addr localhost --master_port 12345 {DOCKER_MOUNT_DIR}/finetune.py "
        f'--model_name_or_path "{DOCKER_TEST_DIR}/{model_type}/" '
        f"--data_path {DATA_DIR} "
        f'--output_dir "{DOCKER_TEST_DIR}/output_qwen" '
        "--num_train_epochs 1 "
        "--per_device_train_batch_size 1 "
        "--per_device_eval_batch_size 1 "
        "--gradient_accumulation_steps 2 "
        '--evaluation_strategy "no" '
        '--save_strategy "steps" '
        "--save_steps 1000 "
        "--save_total_limit 10 "
        "--learning_rate 1e-5 "
        "--weight_decay 0.1 "
        "--adam_beta2 0.95 "
        "--warmup_ratio 0.01 "
        '--lr_scheduler_type "cosine" '
        "--logging_steps 1 "
        '--report_to "none" '
        "--model_max_length 512"
    )
    if deepspeed:
        cmd += f" --deepspeed {deepspeed}"
    if train_type == "lora":
        cmd += " --use_lora"
    elif train_type == "qlora":
        cmd += " --use_lora --q_lora"
    # for SM < 80
    if (
        (not is_ampere)
        and train_type == "lora"
        and (deepspeed and "zero2" in deepspeed)
        and is_chat == "base"
    ):
        cmd += " --fp16 True"

    snapshot_download(model_type, cache_dir=".", revision="master")

    # Remove artifacts left behind by an earlier failed case so that they
    # cannot satisfy the existence check below.
    shutil.rmtree(output_dir, ignore_errors=True)
    try:
        run_in_subprocess(cmd_docker + f'"{cmd}"')
        if train_type == "full":
            assert os.path.exists(f"{output_dir}/config.json")
        else:
            assert os.path.exists(f"{output_dir}/adapter_config.json")
    finally:
        # Always clean up, including when the run or the assertion failed;
        # ignore_errors keeps a missing directory from masking the real failure.
        shutil.rmtree(output_dir, ignore_errors=True)
|
||||
78
recipes/tests/test_inference/test_inference_api.py
Normal file
78
recipes/tests/test_inference/test_inference_api.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import pytest
|
||||
import subprocess
|
||||
import torch
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/..")
|
||||
from utils import run_in_subprocess, simple_openai_api, TelnetPort
|
||||
from ut_config import (
|
||||
MODEL_TYPE,
|
||||
DOCKER_VERSION_CU114,
|
||||
DOCKER_VERSION_CU117,
|
||||
DOCKER_VERSION_CU121,
|
||||
DOCKER_MOUNT_DIR,
|
||||
DOCKER_TEST_DIR,
|
||||
)
|
||||
|
||||
|
||||
# CPU-only combinations are not parametrized because they currently fail:
# use_cpu=True,use_int4=False RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
# use_cpu=True,use_int4=True ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU.You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object
@pytest.mark.parametrize(
    "docker_version,use_cpu,use_int4",
    [
        (DOCKER_VERSION_CU114, False, False),
        (DOCKER_VERSION_CU114, False, True),
        (DOCKER_VERSION_CU117, False, False),
        (DOCKER_VERSION_CU117, False, True),
        (DOCKER_VERSION_CU121, False, False),
        (DOCKER_VERSION_CU121, False, True),
    ],
)
def test_inference_api(docker_version, use_cpu, use_int4):
    """Start ``openai_api.py`` in a container and send it one chat request.

    Args:
        docker_version: Docker image the server runs in.
        use_cpu: When true, ``--cpu-only`` is appended to the server command.
        use_int4: When true, the ``-Chat-Int4`` checkpoint is served instead
            of the ``-Chat`` one.

    Raises:
        Exception: When the chat request fails; the message carries the
            original error followed by the contents of ``tmp.log``.
    """
    container_name = "test_inference_api"
    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c '
    cmd = ""
    # for GPUs SM < 80
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere:
        cmd += "pip uninstall -y flash-attn && "

    cmd += f"python {DOCKER_MOUNT_DIR}/openai_api.py -c {DOCKER_TEST_DIR}/{model_type}"

    if use_cpu:
        cmd += " --cpu-only"

    snapshot_download(model_type, cache_dir=".", revision="master")

    # Tolerant removal: succeeds whether or not the container exists.
    remove_container = f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'

    # start model server
    print(cmd_docker + f'"{cmd}"')
    run_in_subprocess(remove_container)
    try:
        run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")

        while not TelnetPort("localhost", 8000):
            print("Wait for the model service start.")
            time.sleep(0.5)

            # Stop waiting once the container is gone; the request below then
            # fails and reports the server log.
            inspect_result = subprocess.run(
                f"docker inspect {container_name}",
                shell=True,
                stdout=subprocess.DEVNULL,
            )
            if inspect_result.returncode != 0:
                break

        try:
            # while load int4 model such as Qwen-1_8B-Chat-Int4, the model name is Qwen-1_8B-Chat
            simple_openai_api(f"{MODEL_TYPE}-Chat".split("/")[-1])
        except Exception as e:
            # Give the server a moment to flush its log before reading it.
            time.sleep(1)
            with open("tmp.log") as f:
                raise Exception(f"{e} \n {f.read()}") from e
    finally:
        # Remove the container on failure too, so a still-running server does
        # not keep port 8000 occupied for the tests that follow.
        run_in_subprocess(remove_container)
|
||||
73
recipes/tests/test_inference/test_inference_vllm_fschat.py
Normal file
73
recipes/tests/test_inference/test_inference_vllm_fschat.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import pytest
|
||||
import subprocess
|
||||
import torch
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/..")
|
||||
from utils import run_in_subprocess, simple_openai_api, TelnetPort
|
||||
from ut_config import (
|
||||
MODEL_TYPE,
|
||||
DOCKER_VERSION_CU121,
|
||||
DOCKER_MOUNT_DIR,
|
||||
DOCKER_TEST_DIR,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "num_gpus,use_int4",
    [
        (1, False),
        (1, True),
        (2, False),
        # ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.
        # (2, True)
    ],
)
def test_inference_vllm_fschat(num_gpus, use_int4):
    """Serve the model through FastChat + vLLM in a container and send one chat request.

    Args:
        num_gpus: Value passed to the worker's ``--tensor-parallel-size``.
        use_int4: When true, the ``-Chat-Int4`` checkpoint is served instead
            of the ``-Chat`` one.

    Raises:
        Exception: When the chat request fails; the message carries the
            original error followed by the contents of ``tmp.log``.
    """
    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
    container_name = "test_inference_vllm_fschat"
    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c '
    cmd = ""

    # Controller and API server run in the background; the worker stays in the
    # foreground of the container's shell.
    cmd += (
        "nohup python -m fastchat.serve.controller > /dev/null 2>&1 "
        "& python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 "
        f"& python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code"
    )

    # for GPUs SM < 80 or when use_int4 is True
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere or use_int4:
        cmd += " --dtype half"

    snapshot_download(model_type, cache_dir=".", revision="master")

    # Tolerant removal: succeeds whether or not the container exists.
    remove_container = f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'

    # start model server
    run_in_subprocess(remove_container)
    print(cmd_docker + f'"{cmd}"')
    try:
        run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")

        # NOTE(review): 21002 is presumably the port the vllm_worker listens
        # on - confirm against the FastChat defaults.
        while not TelnetPort("localhost", 21002):
            print("Wait for the model service start.")
            time.sleep(0.5)

            # Stop waiting once the container is gone; the request below then
            # fails and reports the server log.
            inspect_result = subprocess.run(
                f"docker inspect {container_name}",
                shell=True,
                stdout=subprocess.DEVNULL,
            )
            if inspect_result.returncode != 0:
                break

        try:
            simple_openai_api(model_type.split("/")[-1])
        except Exception as e:
            # Give the server a moment to flush its log before reading it.
            time.sleep(1)
            with open("tmp.log") as f:
                raise Exception(f"{e} \n {f.read()}") from e
    finally:
        # Remove the container on failure too, so a still-running server does
        # not keep its ports occupied for the tests that follow.
        run_in_subprocess(remove_container)
|
||||
18
recipes/tests/ut_config.py
Normal file
18
recipes/tests/ut_config.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import os
|
||||
|
||||
# common
MODEL_TYPE = "Qwen/Qwen-1_8B"
DOCKER_VERSION_CU114 = "qwenllm/qwen:cu114"
DOCKER_VERSION_CU117 = "qwenllm/qwen:cu117"
DOCKER_VERSION_CU121 = "qwenllm/qwen:cu121"
DOCKER_MOUNT_DIR = "/qwen-recipes"


def _mount_path(relative):
    """Join *relative* onto DOCKER_MOUNT_DIR."""
    return os.path.join(DOCKER_MOUNT_DIR, relative)


DOCKER_TEST_DIR = _mount_path("recipes/tests")

# finetune
DATA_DIR = _mount_path("recipes/tests/assets/test_sampled_qwen.json")
DS_CONFIG_ZERO2_DIR = _mount_path("finetune/ds_config_zero2.json")
DS_CONFIG_ZERO3_DIR = _mount_path("finetune/ds_config_zero3.json")
|
||||
61
recipes/tests/utils.py
Normal file
61
recipes/tests/utils.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import logging
|
||||
import subprocess
|
||||
import socket
|
||||
import openai
|
||||
|
||||
|
||||
def run_in_subprocess(cmd):
    """Run *cmd* through the shell, logging its output line by line.

    stdout and stderr are consumed concurrently: stderr is drained on a helper
    thread while stdout is read on the calling thread. Reading one pipe to EOF
    before touching the other can deadlock once the child fills the unread
    pipe's buffer.

    Args:
        cmd: Command line passed to ``subprocess.Popen`` with ``shell=True``.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The message
            is the non-empty stderr lines, each stripped and followed by a
            newline.
    """
    import threading

    def _drain(stream, collected):
        # Read until EOF; log every non-empty line and remember it.
        for raw_line in iter(stream.readline, b""):
            text = raw_line.decode("utf-8", "ignore").strip()
            if text:
                logging.info(text)
                collected.append(text)

    err_lines = []
    with subprocess.Popen(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    ) as process:
        stderr_reader = threading.Thread(
            target=_drain, args=(process.stderr, err_lines), daemon=True
        )
        stderr_reader.start()
        # stdout lines are only logged, so the collected list is discarded.
        _drain(process.stdout, [])
        stderr_reader.join()
        return_code = process.wait()

    if return_code:
        raise RuntimeError("".join(f"{line}\n" for line in err_lines))
|
||||
|
||||
|
||||
def simple_openai_api(model):
    """Send one non-streaming chat request to the local server and print the reply.

    Args:
        model: Model name placed in the request.
    """
    # Point the client at the server under test; the key is a placeholder.
    openai.api_key = "none"
    openai.api_base = "http://localhost:8000/v1"

    request = {
        "model": model,
        "messages": [{"role": "user", "content": "你好"}],
        # create a request not activating streaming response
        "stream": False,
        # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
        "stop": [],
    }
    response = openai.ChatCompletion.create(**request)
    first_choice = response.choices[0]
    print(first_choice.message.content)
|
||||
|
||||
|
||||
def TelnetPort(server_ip, port):
    """Report whether a TCP connection to ``(server_ip, port)`` can be opened.

    Args:
        server_ip: Host to connect to.
        port: Port to connect to.

    Returns:
        True if the connection attempt succeeds, False if it raises any
        ``Exception`` (the socket uses a one-second timeout).
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    probe.settimeout(1)
    try:
        probe.connect((server_ip, port))
    except Exception:
        reachable = False
    else:
        reachable = True
    probe.close()
    return reachable
|
||||
Reference in New Issue
Block a user