add evaluation code for Qwen-7B-Chat

2026-05-21 00:45:48 +08:00 · 2023-08-03 23:27:48 +08:00
parent 19474456d8
commit 1134e08be7
4 changed files with 716 additions and 0 deletions
--- a/eval/evaluate_chat_humaneval.py
+++ b/eval/evaluate_chat_humaneval.py
@@ -0,0 +1,82 @@
+import random
+import tqdm
+import os
+import sys
+import torch
+import jsonlines
+import argparse
+import jsonlines
+from pathlib import Path
+import re
+import textwrap
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+
+"""
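+Usage:
+
+1. Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data)
+   (the repository ships it gzip-compressed as HumanEval.jsonl.gz; decompress it first).
+2. Generate completions with this script:
+       python eval/evaluate_chat_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl
+3. Install the official evaluation harness:
+       git clone https://github.com/openai/human-eval
+       pip install -e human-eval
+4. Score the generated completions:
+       evaluate_functional_correctness HumanEval_res.jsonl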
+"""
+
+# NOTE(review): DEVICE is not referenced anywhere in the visible code; the
+# model is loaded with device_map="auto" instead. Confirm before removing.
+DEVICE = "cuda:0"
+
+def extract_code(text, entry_point):
+    """Extract the body of the generated function from a chat response.
+
+    Three patterns are tried in order and the first match wins:
+
+    1. a ``` fenced block (optionally tagged python/Python) containing
+       ``def <entry_point>``; the body runs up to the closing fence;
+    2. an unfenced ``def <entry_point>``; the body runs until the first
+       line that is not indented (blank lines are skipped over);
+    3. any ``def`` at all, with the same body rule as (2).
+
+    Args:
+        text: Raw model response.
+        entry_point: Name of the function the task asks for.
+
+    Returns:
+        The function body without its ``def`` line. If no ``def`` is found,
+        the whole response indented by four spaces, on the assumption that
+        the model emitted only the body.
+    """
+    # Escape the name so it is matched literally, and forbid a trailing word
+    # character so that e.g. "sort" does not match a helper named "sort_key".
+    name = re.escape(entry_point) + r"(?!\w)"
+    # Ends a body at a newline that is not followed (after optional blank
+    # lines) by two spaces or a tab, or at the end of the text.
+    body_end = r"(?:\n(?!\n*(?:  |\t))|$)"
+    patterns = (
+        rf"```(?:[Pp]ython\n)?.*?def\s+{name}.*?:\n(.*?)\n```",
+        rf"def\s+{name}.*?:\n(.*?){body_end}",
+        rf"def.*?:\n(.*?){body_end}",
+    )
+    for pattern in patterns:
+        match = re.search(pattern, text, re.DOTALL)
+        if match is not None:
+            return match.group(1)
+
+    # No code block found: assume the LM simply filled in the function body.
+    return textwrap.indent(text, ' ' * 4)
+
+def generate_sample(model, tokenizer, question, entry_point):
+    """Ask the chat model one question and pull the function body from its reply.
+
+    Args:
+        model: Object exposing ``chat(tokenizer, question, history=...)`` that
+            returns a ``(response, history)`` pair.
+        tokenizer: Tokenizer handed through to ``model.chat``.
+        question: Prompt text sent as a fresh, history-free turn.
+        entry_point: Function name forwarded to ``extract_code``.
+
+    Returns:
+        A ``(answer, response)`` tuple: the extracted code and the raw reply.
+    """
+    # Each task is an independent conversation, so the returned history is dropped.
+    response, _ = model.chat(tokenizer, question, history=None)
+
+    # Echo prompt and reply so progress can be followed on the console.
+    print(question)
+    print(response)
+
+    return extract_code(response, entry_point), response
+
+def main():
+    """Generate HumanEval completions with a chat checkpoint using greedy decoding.
+
+    Reads tasks from the ``-f`` JSON Lines file, queries the model once per
+    task, and writes one record per task (``task_id``, ``completion``,
+    ``response``) to the ``-o`` JSON Lines file.
+    """
+    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
+    # Kept as str: the default is a hub repo id rather than a filesystem path,
+    # and pathlib would rewrite its separator on Windows.
+    parser.add_argument("-c", "--checkpoint-path", type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat")
+    parser.add_argument("-f", "--sample-input-file", type=str, required=True, help="data path to HumanEval.jsonl")
+    parser.add_argument("-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl")
+    args = parser.parse_args()
+
+    # Fail before the expensive model load if the input file is missing.
+    if not os.path.isfile(args.sample_input_file):
+        parser.error(f"input file not found: {args.sample_input_file}")
+
+    print('Loading tokenizer ...')
+    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True)
+
+    print('Loading model ...')
+    model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval()
+    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True)
+    model.generation_config.do_sample = False  # use greedy decoding
+
+    # jsonlines.open owns both underlying files, so leaving the with-block
+    # closes the reader and the writer even when generation raises. The input
+    # is opened first so a bad path never truncates an existing output file.
+    with jsonlines.open(args.sample_input_file) as reader, \
+            jsonlines.open(args.sample_output_file, mode='w') as writer:
+        for jobj in tqdm.tqdm(reader, desc='task_idx'):
+            prompt = "Help me fill the following code.\n" + jobj['prompt']
+            task_id = jobj['task_id']
+            answer, response = generate_sample(model, tokenizer, prompt, jobj['entry_point'])
+            gen_jobjs = {'task_id': task_id, "completion": answer, 'response': response}
+            writer.write(gen_jobjs)
+
+
+if __name__ == '__main__':
+    main()