release latest models

2026-05-20 16:35:47 +08:00 · 2023-09-25 10:41:59 +08:00
parent fb3180d8f0
commit fc57dea277
13 changed files with 938 additions and 235 deletions
--- a/eval/EVALUATION.md
+++ b/eval/EVALUATION.md
@@ -12,7 +12,7 @@ cd ../../
 # Qwen-7B
 python evaluate_ceval.py -d data/ceval/

-# Qwen-7B-Chat
+# Qwen-7B-Chat (only 0-shot reproduction scripts are provided; the 5-shot results were obtained with OpenCompass: https://github.com/InternLM/opencompass)
 pip install thefuzz
 python evaluate_chat_ceval.py -d data/ceval/
 ```
@@ -29,7 +29,7 @@ cd ../../
 # Qwen-7B
 python evaluate_mmlu.py -d data/mmlu/data/

-# Qwen-7B-Chat
+# Qwen-7B-Chat (only 0-shot reproduction scripts are provided; the 5-shot results were obtained with OpenCompass: https://github.com/InternLM/opencompass)
 pip install thefuzz
 python evaluate_chat_mmlu.py -d data/mmlu/data/
 ```
@@ -73,9 +73,8 @@ This program exists to run untrusted model-generated code. Users are strongly en
 # Qwen-7B
 python evaluate_gsm8k.py

-# Qwen-7B-Chat
+# Qwen-7B-Chat (only 0-shot reproduction scripts are provided; the 5-shot results were obtained with OpenCompass: https://github.com/InternLM/opencompass)
 python evaluate_chat_gsm8k.py # zeroshot
-python evaluate_chat_gsm8k.py --use-fewshot # fewshot
 ```

 - PLUGIN
--- a/eval/evaluate_chat_humaneval.py
+++ b/eval/evaluate_chat_humaneval.py
@@ -1,4 +1,3 @@
-
 import re
 import textwrap
 import argparse
@@ -19,6 +18,7 @@ evaluate_functional_correctness HumanEval_res.jsonl

 DEVICE = "cuda:0"

+
 def extract_code(text, entry_point):
    # 正则表达式匹配代码块
    code_block_pattern = re.compile(
@@ -99,7 +99,26 @@ if __name__ == "__main__":
    f = jsonlines.open(args.sample_input_file)
    with f_output as output:
        for jobj in tqdm.tqdm(f, desc="task_idx"):
-            prompt = "Help me fill the following code.\n" + jobj["prompt"]
+            # use humanevalpack prompt
+            signature = re.search(
+                rf"def\s+({jobj['entry_point']}.*?):\s*\n", jobj["prompt"]
+            ).group(1)
+            description = "\n".join(
+                [
+                    line.strip()
+                    for line in re.search(
+                        rf"(?:\"\"\"|''')(.*?)(?:\"\"\"|''')", jobj["prompt"], re.DOTALL
+                    )
+                    .group(1)
+                    .split("\n")
+                ]
+            )
+            prompt = (
+                f"Write a Python function `{signature}` to solve the following problem:\n"
+                f"{description}\n"
+                f"{jobj['prompt']}"
+            )
+
            task_id = jobj["task_id"]
            answer, response = generate_sample(
                model, tokenizer, prompt, jobj["entry_point"]