first commit

This commit is contained in:
JustinLin610
2023-08-03 12:57:53 +08:00
commit ba2d85a13b
23 changed files with 1952 additions and 0 deletions

45
eval/EVALUATION.md Normal file
View File

@@ -0,0 +1,45 @@
## 评测复现
- CEVAL
```Shell
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
mkdir -p data/ceval
mv ceval-exam.zip data/ceval
cd data/ceval; unzip ceval-exam.zip
cd ../../
python evaluate_ceval.py -d data/ceval/
```
- MMLU
```Shell
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir -p data/mmlu
mv data.tar data/mmlu
cd data/mmlu; tar xf data.tar
cd ../../
python evaluate_mmlu.py -d data/mmlu/data/
```
- HumanEval
Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data)
```Shell
python evaluate_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl
git clone https://github.com/openai/human-eval
pip install -e human-eval
evaluate_functional_correctness HumanEval_res.jsonl
```
When installing the human-eval package, please note its disclaimer:
This program exists to run untrusted model-generated code. Users are strongly encouraged not to do so outside of a robust security sandbox. The execution call in execution.py is deliberately commented out to ensure users read this disclaimer before running code in a potentially unsafe manner. See the comment in execution.py for more information and instructions.
- GSM8K
```Shell
python evaluate_gsm8k.py
```

263
eval/evaluate_ceval.py Normal file
View File

@@ -0,0 +1,263 @@
import os
import pandas as pd
import numpy as np
import argparse
import datasets
import torch
from typing import List
from tqdm import tqdm
from transformers.trainer_utils import set_seed
'''
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
mkdir data/ceval
mv ceval-exam.zip data/ceval
cd data/ceval; unzip ceval-exam.zip
cd ../../
python evaluate_ceval.py -d data/ceval/
'''
def load_models_tokenizer(args):
    """Load the causal LM and its tokenizer from ``args.checkpoint_path``.

    The model is loaded with ``device_map="auto"``, put into eval mode, and
    given the generation config stored with the checkpoint. Remote code is
    trusted for all three ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation import GenerationConfig

    checkpoint = args.checkpoint_path
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, device_map="auto", trust_remote_code=True
    )
    model = model.eval()
    model.generation_config = GenerationConfig.from_pretrained(
        checkpoint, trust_remote_code=True
    )
    return model, tokenizer
def format_example(line, include_answer=True):
    """Render one C-Eval row as prompt text.

    Args:
        line: Mapping-like row with a ``'question'`` entry, one entry per
            label in the module-level ``choices``, and (when
            ``include_answer`` is true) an ``'answer'`` entry.
        include_answer: If true, append the reference answer followed by a
            blank line; otherwise end with the bare answer cue.

    Returns:
        The formatted example as a single string.
    """
    pieces = ['问题:' + line['question']]
    pieces.extend(f'\n{choice}. {line[choice]}' for choice in choices)
    if include_answer:
        pieces.append('\n答案:' + line["answer"] + '\n\n')
    else:
        pieces.append('\n答案:')
    return ''.join(pieces)
def generate_few_shot_prompt(k, subject, dev_df):
    """Concatenate the first ``k`` dev-set rows into a few-shot prompt.

    Args:
        k: Number of solved examples to include. ``-1`` means every row of
            ``dev_df``. Values larger than the number of rows are clamped.
        subject: Subject name; accepted but not used by this function.
        dev_df: DataFrame of solved examples, read positionally via ``iloc``.

    Returns:
        The formatted examples joined into one string ('' when there are none).
    """
    n_rows = dev_df.shape[0]
    # Clamp so a dev split with fewer than k rows does not raise IndexError.
    n_shots = n_rows if k == -1 else min(k, n_rows)
    return ''.join(
        format_example(dev_df.iloc[i, :], include_answer=True)
        for i in range(n_shots)
    )
def get_logits(tokenizer, model, inputs: List[str]):
    """Run the model on ``inputs`` and return scores for the next token.

    Despite the function name, the first returned value is the softmax over
    the vocabulary of the last position's logits, i.e. probabilities.

    Args:
        tokenizer: Callable returning a dict with an ``'input_ids'`` entry.
        model: Callable on the id tensor, returning a mapping with
            ``'logits'``; must expose ``.device``.
        inputs: Prompt strings. They are tokenized without padding, so all
            prompts must tokenize to the same length to form one tensor.

    Returns:
        ``(probs, info)`` where ``info['tokens']['input_ids']`` is the id
        tensor that was fed to the model.
    """
    encoded = tokenizer(inputs, padding=False)['input_ids']
    input_ids = torch.tensor(encoded, device=model.device)
    last_position = model(input_ids)['logits'][:, -1, :]
    next_token_probs = torch.nn.functional.softmax(last_position, dim=-1)
    return next_token_probs, {'tokens': {'input_ids': input_ids}}
@torch.no_grad()
def eval_subject(
    model,
    tokenizer,
    subject_name,
    test_df,
    k=5,
    dev_df=None,
    few_shot=False,
    save_result_dir=None,
    **kwargs
):
    """Score one subject by comparing the next-token scores of A/B/C/D.

    For every row of ``test_df`` the prompt (optional few-shot prefix plus
    the question) is fed to the model and the option with the highest score
    is taken as the prediction. Reads the module-level ``args.debug`` flag
    and the module-level ``choices`` list.

    Args:
        model: Model passed through to ``get_logits``.
        tokenizer: Tokenizer passed through to ``get_logits``; also used to
            look up the token ids of the option letters.
        subject_name: Subject id, used for the few-shot prompt and the
            result file name.
        test_df: DataFrame of questions. When ``save_result_dir`` is set,
            prediction columns are added to it in place.
        k: Number of few-shot examples (see ``generate_few_shot_prompt``).
        dev_df: DataFrame supplying the few-shot examples.
        few_shot: Whether to prepend few-shot examples.
        save_result_dir: If truthy, directory that receives
            ``{subject_name}_result.csv``.
        **kwargs: Ignored.

    Returns:
        Accuracy in percent over rows that have an ``'answer'`` entry, or 0
        when no row has one.
    """
    result = []
    score = []
    # The zero-shot prefix must be a string so it can be concatenated with
    # the question text below (a list here raises TypeError).
    few_shot_prompt = generate_few_shot_prompt(
        k, subject_name, dev_df) if few_shot else ''
    all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []}
    if args.debug:
        print(f"few_shot_prompt: {few_shot_prompt}")

    # Token ids of the option letters do not depend on the row: look them up once.
    option_token_ids = [
        tokenizer(letter)['input_ids'] for letter in ("A", "B", "C", "D")
    ]

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        question = format_example(row, include_answer=False)
        full_prompt = few_shot_prompt + question

        output, _ = get_logits(tokenizer, model, [full_prompt])
        assert output.shape[0] == 1
        logits = output.flatten()

        # NOTE(review): get_logits already returns softmax probabilities, so
        # this is a softmax over probabilities. The ranking (and therefore
        # the prediction) is unaffected, but the saved prob_* values are
        # flattened towards uniform. Kept as-is to preserve reported outputs.
        softval = torch.nn.functional.softmax(
            torch.tensor([logits[ids] for ids in option_token_ids]),
            dim=0,
        )
        if softval.dtype in {torch.bfloat16, torch.float16}:
            # numpy conversion below needs a dtype numpy understands
            softval = softval.to(dtype=torch.float32)
        probs = softval.detach().cpu().numpy()

        for i, choice in enumerate(choices):
            all_probs[f'prob_{choice}'].append(probs[i])
        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]

        if 'answer' in row:
            correct = 1 if pred == row['answer'] else 0
            score.append(correct)
            if args.debug:
                print(f'{question} pred: {pred} ref: {row["answer"]}')
        result.append(pred)

    if score:
        correct_ratio = 100 * sum(score) / len(score)
        if args.debug:
            print(subject_name, correct_ratio)
    else:
        correct_ratio = 0

    if save_result_dir:
        test_df['model_output'] = result
        for choice in choices:
            test_df[f'prob_{choice}'] = all_probs[f'prob_{choice}']
        if score:
            test_df["correctness"] = score
        os.makedirs(save_result_dir, exist_ok=True)
        test_df.to_csv(
            os.path.join(save_result_dir, f'{subject_name}_result.csv'),
            encoding="utf-8",
            index=False,
        )

    return correct_ratio
def cal_ceval(res):
    """Print per-category, hard-subset and overall average accuracy.

    All averages are unweighted means over subjects (each subject counts
    once, regardless of how many questions it has).

    Args:
        res: Mapping from result key to that subject's accuracy (any value
            ``float()`` accepts). The subject name is the part of the key
            after the last ``'-'`` and must be a key of
            ``TASK_NAME_MAPPING``.

    Raises:
        KeyError: If a subject name is not in ``TASK_NAME_MAPPING``.
    """
    if not res:
        # Nothing was evaluated; avoid dividing by a zero subject count.
        print('No results to aggregate.')
        return

    acc_sum_dict = {}
    cnt_dict = {}
    acc_sum = 0.
    hard_cnt = 0
    hard_acc_sum = 0.
    for key, value in res.items():
        name = key.split('-')[-1]
        acc = float(value)
        acc_sum += acc
        class_ = TASK_NAME_MAPPING[name][2]  # index 2 holds the category
        if name in hard_list:
            hard_cnt += 1
            hard_acc_sum += acc
        acc_sum_dict[class_] = acc_sum_dict.get(class_, 0.) + acc
        cnt_dict[class_] = cnt_dict.get(class_, 0) + 1

    print('\n\n\n')
    for k in ['STEM', 'Social Science', 'Humanities', 'Other']:
        if k in cnt_dict:
            print('%s acc: %.2f ' % (
                k, acc_sum_dict[k] / cnt_dict[k]))
    if hard_cnt > 0:
        print('Hard acc:%.2f ' % (hard_acc_sum / hard_cnt))
    print('AVERAGE acc:%.2f ' % (acc_sum / len(res)))
# Maps each C-Eval subject id to [English name, Chinese name, category].
# The keys double as the CSV file-name stems read by main(); only the
# category (index 2) is read in this file, by cal_ceval().
TASK_NAME_MAPPING = {
"computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
"operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
"computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
"college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
"college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
"college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
"advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
"probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
"discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
"electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
"metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
"high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
"high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
"high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
"high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
"middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
"middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
"middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
"middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
"veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
"college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
"business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
"marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
"mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
"education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
"teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
"high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
"high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
"middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
"middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
"modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
"ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
"logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
"law": ["Law", "\u6cd5\u5b66", "Humanities"],
"chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
"art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
"professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
"legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
"high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
"high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
"middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
"civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
"sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
"plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
"basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
"clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
"urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
"accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
"fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
"tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
"physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"]
}
# Subjects whose accuracies cal_ceval() also averages into the "Hard" score.
hard_list = [
    'advanced_mathematics',
    'discrete_mathematics',
    'probability_and_statistics',
    'college_physics',
    'college_chemistry',
    'high_school_mathematics',
    'high_school_physics',
    'high_school_chemistry',
]
# Answer option labels; format_example() also uses them as row keys.
choices = list("ABCD")
def main(args):
    """Evaluate every subject in ``TASK_NAME_MAPPING`` and print a summary.

    For each subject the ``dev`` CSV supplies the 5 few-shot examples and
    the ``val`` CSV is scored. Per-subject results are written under
    ``outs/ceval_eval_result`` and the aggregate is printed by ``cal_ceval``.

    Args:
        args: Parsed CLI namespace; ``eval_data_path`` is read here and the
            whole namespace is handed to ``load_models_tokenizer``.
    """
    model, tokenizer = load_models_tokenizer(args)

    data_root = args.eval_data_path
    subject_scores = {}
    for subject_name in tqdm(TASK_NAME_MAPPING.keys()):
        val_csv = os.path.join(data_root, 'val', f'{subject_name}_val.csv')
        dev_csv = os.path.join(data_root, 'dev', f'{subject_name}_dev.csv')
        val_df = pd.read_csv(val_csv)
        dev_df = pd.read_csv(dev_csv)

        subject_scores[subject_name] = eval_subject(
            model,
            tokenizer,
            subject_name,
            val_df,
            dev_df=dev_df,
            k=5,
            few_shot=True,
            save_result_dir="outs/ceval_eval_result",
        )
    cal_ceval(subject_scores)
if __name__ == '__main__':
    # Command-line entry point: parse options, apply the seed, run the evaluation.
    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
    parser.add_argument('-c', '--checkpoint-path', default="Qwen/Qwen-7B", type=str,
                        help='Checkpoint path')
    parser.add_argument('-s', '--seed', default=1234, type=int, help='Random seed')

    # Provide extra arguments required for tasks.
    eval_options = parser.add_argument_group(title='Evaluation options')
    eval_options.add_argument('-d', '--eval_data_path', required=True, type=str,
                              help='Path to eval data')
    # NOTE(review): --max-seq-len is parsed but not read by the C-Eval
    # functions visible in this script.
    eval_options.add_argument("--max-seq-len", default=2048, type=int,
                              help='Size of the output generated text.')
    eval_options.add_argument("--debug", default=False, action='store_true',
                              help='Print infos.')

    # `args` stays a module-level name: eval_subject() reads args.debug as a global.
    args = parser.parse_args()
    set_seed(args.seed)
    main(args)

110
eval/evaluate_gsm8k.py Normal file
View File

@@ -0,0 +1,110 @@
import random
import tqdm
import os
import re
import sys
import torch
import numpy as np
import jsonlines
import argparse
import jsonlines
import datasets
from datasets import load_from_disk,load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
# Matches the "#### <number>" answer marker and captures the number text
# (optional leading minus, then digits, dots and commas).
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
# Sentinel returned by the answer extractors when nothing could be parsed.
INVALID_ANS = "[invalid]"
def doc_to_text(doc):
    """Build the prompt for one sample.

    The prompt is the module-level ``fewshot_prompt`` followed by the
    sample's question and the step-by-step cue line.

    Args:
        doc: Mapping with a string ``"question"`` entry.
    """
    return "".join((
        fewshot_prompt,
        "\nQuestion: ",
        doc["question"],
        "\nLet's think step by step\n",
    ))
def decode(tokens_list, tokenizer, raw_text_len):
    """Decode generated sequences and cut each at the first stop marker.

    Args:
        tokens_list: Iterable of token-id tensors, one per generated sequence.
        tokenizer: Object whose ``.tokenizer.decode`` turns a list of ids
            into text.
        raw_text_len: Number of leading (prompt) tokens to drop before
            decoding.

    Returns:
        A list with one truncated completion string per input sequence.
    """
    # Applied in this order; each keeps only the text before the marker.
    stop_markers = ('<|endoftext|>', '\n\n\n', "\n\n", "Question:")
    sents = []
    for tokens in tokens_list:
        token_ids = tokens.cpu().numpy().tolist()
        text = tokenizer.tokenizer.decode(token_ids[raw_text_len:])
        for marker in stop_markers:
            text = text.split(marker)[0]
        sents.append(text)
    return sents
def generate_sample(model, tokenizer, input_txt):
    """Generate a completion for ``input_txt`` and return the decoded text.

    The prompt is encoded with ``tokenizer.tokenizer.encode``, passed to
    ``model.generate`` as a batch of one, and the first decoded result from
    ``decode`` is returned. The prompt and the completion are printed.
    """
    prompt_ids = tokenizer.tokenizer.encode(input_txt)
    prompt_len = len(prompt_ids)
    context_enc = torch.tensor([prompt_ids]).to(model.device)
    print(f"Input text: {input_txt}\n")

    generated = model.generate(context_enc)
    completions = decode(generated, tokenizer, prompt_len)
    output_text = completions[0]
    print(f"\nOutput text: {output_text}\n")
    return output_text
def extract_answer_hf(completion):
    """Parse the number that follows the ``#### `` marker.

    Args:
        completion: Text searched with ``ANS_RE``.

    Returns:
        The number as an ``int`` when it is a plain integer, otherwise as a
        ``float``; ``INVALID_ANS`` when there is no marker or the captured
        text is not a number. Thousands separators (commas) are removed
        before parsing.
    """
    match = ANS_RE.search(completion)
    if match is None:
        return INVALID_ANS
    number_text = match.group(1).strip().replace(",", "")
    # Parse with int/float instead of eval(): the captured text comes from
    # data and need not be a valid Python literal (e.g. "05", "1.2.3", "-").
    for parse in (int, float):
        try:
            return parse(number_text)
        except ValueError:
            continue
    return INVALID_ANS
def extract_answer(completion):
    """Return the last run of digits in ``completion`` as an ``int``.

    Only unsigned digit runs are recognised, so for text ending in "3.50"
    the result is 50. Leading zeros are accepted ("007" -> 7).

    Args:
        completion: Model output text.

    Returns:
        The parsed integer, or ``INVALID_ANS`` when ``completion`` contains
        no digits, is not a string, or the digit run cannot be converted.
    """
    try:
        # int() instead of eval(): no code execution, and leading zeros parse.
        return int(re.findall(r'\d+', completion)[-1])
    except (IndexError, TypeError, ValueError):
        return INVALID_ANS
def is_correct(completion, answer):
    """Tell whether the number extracted from ``completion`` equals the gold one.

    Args:
        completion: Model output, parsed with ``extract_answer``.
        answer: Reference answer text, parsed with ``extract_answer_hf``.

    Raises:
        AssertionError: If no gold answer can be parsed from ``answer``.
    """
    gold = extract_answer_hf(answer)
    assert gold != INVALID_ANS, "No ground truth answer found in the document."
    predicted = extract_answer(completion)
    return predicted == gold
if __name__ == '__main__':
    # Command-line entry point: generate a completion for every GSM8K test
    # sample, write one JSON line per sample, and print the mean accuracy.
    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
    parser.add_argument("-c", "--checkpoint-path", type=str, help="Checkpoint path", default="Qwen/Qwen-7B")
    parser.add_argument("-f", "--sample-input-file", type=str, default=None)
    parser.add_argument("-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl")

    args = parser.parse_args()

    # Module-level name on purpose: doc_to_text() reads it as a global.
    # Read through a context manager so the file handle is closed promptly.
    with open("gsm8k_prompt.txt", encoding="utf-8") as prompt_file:
        fewshot_prompt = prompt_file.read()

    if args.sample_input_file is not None:
        dataset = load_from_disk(args.sample_input_file)
    else:
        config = datasets.DownloadConfig(resume_download=True, max_retries=100)
        dataset = load_dataset("gsm8k", 'main', download_config=config)

    test = dataset["test"]

    print('Loading tokenizer ...')
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True)

    print('Loading model ...')
    model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval()
    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True)
    model.generation_config.do_sample = False

    acc_res = []
    # Own the file object here: a jsonlines.Writer wrapped around an already
    # open file does not close that file itself.
    with open(args.sample_output_file, 'w', encoding='utf-8') as out_fp, \
            jsonlines.Writer(out_fp) as f_output:
        for doc in test:
            context = doc_to_text(doc)
            completion = generate_sample(model, tokenizer, context)
            answer = doc["answer"]
            acc = is_correct(completion, answer)
            doc["completion"] = completion
            doc["acc"] = acc
            f_output.write(doc)
            acc_res.append(acc)

    print("Acc: ", np.mean(acc_res))

View File

@@ -0,0 +1,70 @@
import random
import tqdm
import os
import sys
import torch
import jsonlines
import argparse
import jsonlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
"""
git clone https://github.com/openai/human-eval
$ pip install -e human-eval
evaluate_functional_correctness sample-output-file
"""
def decode(tokens_list, tokenizer, raw_text_len):
    """Decode generated sequences and cut each at the first stop marker.

    Args:
        tokens_list: Iterable of token-id tensors, one per generated sequence.
        tokenizer: Object whose ``.tokenizer.decode`` turns a list of ids
            into text.
        raw_text_len: Number of leading (prompt) tokens to drop before
            decoding.

    Returns:
        A list with one truncated completion string per input sequence.
    """
    # Applied in this order; each keeps only the text before the marker.
    stop_markers = ('<|endoftext|>', '\n\n\n', "\n\n", "def ")
    sents = []
    for tokens in tokens_list:
        token_ids = tokens.cpu().numpy().tolist()
        text = tokenizer.tokenizer.decode(token_ids[raw_text_len:])
        for marker in stop_markers:
            text = text.split(marker)[0]
        sents.append(text)
    return sents
def generate_sample(model, tokenizer, input_txt):
    """Generate a completion for ``input_txt`` and return the decoded text.

    The prompt is encoded with ``tokenizer.tokenizer.encode``, passed to
    ``model.generate`` as a batch of one, and the first decoded result from
    ``decode`` is returned. The prompt and the completion are printed.
    """
    prompt_ids = tokenizer.tokenizer.encode(input_txt)
    prompt_len = len(prompt_ids)
    context_enc = torch.tensor([prompt_ids]).to(model.device)
    print(f"Input text: {input_txt}\n")

    generated = model.generate(context_enc)
    completions = decode(generated, tokenizer, prompt_len)
    output_text = completions[0]
    print(f"\nOutput text: \n{output_text}\n")
    return output_text
if __name__ == '__main__':
    # Command-line entry point: generate one completion per HumanEval task
    # and write {"task_id", "completion"} records as JSON lines.
    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
    parser.add_argument("-c", "--checkpoint-path", type=str, help='Checkpoint path', default="Qwen/Qwen-7B")
    parser.add_argument("-f", "--sample-input-file", type=str, default=None, help="data path to HumanEval.jsonl")
    parser.add_argument("-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl")

    args = parser.parse_args()
    if args.sample_input_file is None:
        # Fail before loading the model rather than when opening the file.
        parser.error("-f/--sample-input-file is required (path to HumanEval.jsonl)")

    print('Loading tokenizer ...')
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True)

    print('Loading model ...')
    model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval()
    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True)
    model.generation_config.do_sample = False

    # Context managers close the reader and the output file even if
    # generation fails part-way. The output file object is owned here because
    # a jsonlines.Writer wrapped around an open file does not close it.
    with jsonlines.open(args.sample_input_file) as reader, \
            open(args.sample_output_file, 'w', encoding='utf-8') as out_fp, \
            jsonlines.Writer(out_fp) as output:
        for jobj in tqdm.tqdm(reader, desc='task_idx'):
            prompt = jobj['prompt']
            task_id = jobj['task_id']
            gen_sents = generate_sample(model, tokenizer, prompt)
            gen_jobjs = {'task_id': task_id, "completion": gen_sents}
            output.write(gen_jobjs)

218
eval/evaluate_mmlu.py Normal file
View File

@@ -0,0 +1,218 @@
import os
import pandas as pd
import numpy as np
import argparse
import datasets
import torch
from typing import List
from tqdm import tqdm
from transformers.trainer_utils import set_seed
'''
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir data/mmlu
mv data.tar data/mmlu
cd data/mmlu; tar xf data.tar
cd ../../
python eval/evaluate_mmlu.py -d data/mmlu/data/
'''
def load_models_tokenizer(args):
    """Load the causal LM and its tokenizer from ``args.checkpoint_path``.

    The model is loaded with ``device_map="auto"``, put into eval mode, and
    given the generation config stored with the checkpoint. Remote code is
    trusted for all three ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation import GenerationConfig

    checkpoint = args.checkpoint_path
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, device_map="auto", trust_remote_code=True
    )
    model = model.eval()
    model.generation_config = GenerationConfig.from_pretrained(
        checkpoint, trust_remote_code=True
    )
    return model, tokenizer
def format_example(line, include_answer=True):
    """Render one MMLU row as prompt text.

    Args:
        line: Mapping-like row with a ``'question'`` entry, one entry per
            label in the module-level ``choices``, and (when
            ``include_answer`` is true) an ``'answer'`` entry.
        include_answer: If true, append the reference answer followed by a
            blank line; otherwise end with the bare answer cue.

    Returns:
        The formatted example as a single string.
    """
    pieces = ['Question: ' + line['question']]
    pieces.extend(f'\n{choice}. {line[choice]}' for choice in choices)
    if include_answer:
        pieces.append('\nAnswer: ' + line["answer"] + '\n\n')
    else:
        pieces.append('\nAnswer:')
    return ''.join(pieces)
def generate_few_shot_prompt(k, subject, dev_df):
    """Build the MMLU few-shot prefix: a subject header plus ``k`` solved examples.

    Args:
        k: Number of solved examples to include. ``-1`` means every row of
            ``dev_df``. Values larger than the number of rows are clamped.
        subject: Subject id such as ``"high_school_physics"``; underscores
            become spaces in the header sentence.
        dev_df: DataFrame of solved examples, read positionally via ``iloc``.

    Returns:
        The header followed by the formatted examples, as one string.
    """
    def format_subject(subject):
        # "high_school_physics" -> "high school physics"
        return " ".join(subject.split("_")).strip()

    prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))

    n_rows = dev_df.shape[0]
    # Clamp so a dev split with fewer than k rows does not raise IndexError.
    n_shots = n_rows if k == -1 else min(k, n_rows)
    examples = (
        format_example(dev_df.iloc[i, :], include_answer=True)
        for i in range(n_shots)
    )
    return prompt + "".join(examples)
def get_logits(tokenizer, model, inputs: List[str]):
    """Run the model on ``inputs`` and return scores for the next token.

    Despite the function name, the first returned value is the softmax over
    the vocabulary of the last position's logits, i.e. probabilities.
    Prompts longer than the module-level ``args.max_seq_len`` are truncated
    from the left, keeping the trailing ``max_seq_len - 1`` tokens.

    Args:
        tokenizer: Callable returning a dict with an ``'input_ids'`` entry.
        model: Callable on the id tensor, returning a mapping with
            ``'logits'``; must expose ``.device``.
        inputs: Prompt strings. They are tokenized without padding, so all
            prompts must tokenize to the same length to form one tensor.

    Returns:
        ``(probs, info)`` where ``info['tokens']['input_ids']`` is the
        (possibly truncated) id tensor that was fed to the model.
    """
    encoded = tokenizer(inputs, padding=False)['input_ids']
    input_ids = torch.tensor(encoded, device=model.device)

    seq_len = input_ids.shape[1]
    if seq_len > args.max_seq_len:
        # Drop the oldest tokens so the question at the end is preserved.
        input_ids = input_ids[:, seq_len - args.max_seq_len + 1:]

    last_position = model(input_ids)['logits'][:, -1, :]
    next_token_probs = torch.nn.functional.softmax(last_position, dim=-1)
    return next_token_probs, {'tokens': {'input_ids': input_ids}}
@torch.no_grad()
def eval_subject(
    model,
    tokenizer,
    subject_name,
    test_df,
    k=5,
    dev_df=None,
    few_shot=False,
    save_result_dir=None,
    **kwargs
):
    """Score one subject by comparing the next-token scores of " A".." D".

    For every row of ``test_df`` the prompt (optional few-shot prefix plus
    the question) is fed to the model and the option with the highest score
    is taken as the prediction. Reads the module-level ``args.debug`` flag
    and the module-level ``choices`` list.

    Args:
        model: Model passed through to ``get_logits``.
        tokenizer: Tokenizer passed through to ``get_logits``; also used to
            look up the token ids of the space-prefixed option letters.
        subject_name: Subject id, used for the few-shot prompt and the
            result file name.
        test_df: DataFrame of questions. When ``save_result_dir`` is set,
            prediction columns are added to it in place.
        k: Number of few-shot examples (see ``generate_few_shot_prompt``).
        dev_df: DataFrame supplying the few-shot examples.
        few_shot: Whether to prepend few-shot examples.
        save_result_dir: If truthy, directory that receives
            ``{subject_name}_result.csv``.
        **kwargs: Ignored.

    Returns:
        A list with one 0/1 correctness flag per row that has an
        ``'answer'`` entry.
    """
    result = []
    score = []
    # The zero-shot prefix must be a string so it can be concatenated with
    # the question text below (a list here raises TypeError).
    few_shot_prompt = generate_few_shot_prompt(
        k, subject_name, dev_df) if few_shot else ''
    all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []}
    if args.debug:
        print(f"few_shot_prompt: {few_shot_prompt}")

    # Token ids of the option letters do not depend on the row: look them up
    # once. The leading space matches the "Answer:" cue, which ends without one.
    option_token_ids = [
        tokenizer(letter)['input_ids'] for letter in (" A", " B", " C", " D")
    ]

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        question = format_example(row, include_answer=False)
        full_prompt = few_shot_prompt + question

        output, _ = get_logits(tokenizer, model, [full_prompt])
        assert output.shape[0] == 1
        logits = output.flatten()

        # NOTE(review): get_logits already returns softmax probabilities, so
        # this is a softmax over probabilities. The ranking (and therefore
        # the prediction) is unaffected, but the saved prob_* values are
        # flattened towards uniform. Kept as-is to preserve reported outputs.
        softval = torch.nn.functional.softmax(
            torch.tensor([logits[ids] for ids in option_token_ids]),
            dim=0,
        )
        if softval.dtype in {torch.bfloat16, torch.float16}:
            # numpy conversion below needs a dtype numpy understands
            softval = softval.to(dtype=torch.float32)
        probs = softval.detach().cpu().numpy()

        for i, choice in enumerate(choices):
            all_probs[f'prob_{choice}'].append(probs[i])
        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]

        if 'answer' in row:
            correct = 1 if pred == row['answer'] else 0
            score.append(correct)
            if args.debug:
                print(f'{question} pred: {pred} ref: {row["answer"]}')
        result.append(pred)

    if save_result_dir:
        test_df['model_output'] = result
        for choice in choices:
            test_df[f'prob_{choice}'] = all_probs[f'prob_{choice}']
        if score:
            test_df["correctness"] = score
        os.makedirs(save_result_dir, exist_ok=True)
        test_df.to_csv(
            os.path.join(save_result_dir, f'{subject_name}_result.csv'),
            encoding="utf-8",
            index=False,
        )

    return score
def cal_mmlu(res):
    """Print per-category and overall accuracy (in percent) for MMLU.

    Accuracies are weighted by question count: the correctness flags of all
    subjects in a category are pooled before dividing.

    Args:
        res: Mapping from subject id to a list of 0/1 correctness flags.
            Every subject listed in ``TASK_NAME_MAPPING`` must be present.

    Raises:
        KeyError: If a subject from ``TASK_NAME_MAPPING`` is missing in ``res``.
    """
    acc_sum_dict = {}
    cnt_dict = {}
    acc_sum = 0.
    cnt = 0
    for class_, subjects in TASK_NAME_MAPPING.items():
        class_correct = 0.
        class_total = 0
        for tt in subjects:
            flags = res[tt]
            class_correct += sum(flags)
            class_total += len(flags)
        acc_sum_dict[class_] = class_correct
        cnt_dict[class_] = class_total
        acc_sum += class_correct
        cnt += class_total

    print('\n\n\n', 'total cnt:', cnt, '\n')
    for k in TASK_NAME_MAPPING.keys():
        # Skip categories without any scored question instead of dividing by zero.
        if cnt_dict[k] > 0:
            print('%s ACC: %.2f ' % (
                k, acc_sum_dict[k] / cnt_dict[k] * 100))
    if cnt > 0:
        print('AVERAGE ACC:%.2f ' % (acc_sum / cnt * 100))
def main(args):
    """Evaluate every subject in ``SUBJECTS`` and print a summary.

    For each subject the ``dev`` CSV supplies the 5 few-shot examples and
    the ``test`` CSV is scored. The CSVs are read without a header row,
    using fixed column names. Per-subject results are written under
    ``outs/mmlu_eval_result`` and the aggregate is printed by ``cal_mmlu``.

    Args:
        args: Parsed CLI namespace; ``eval_data_path`` is read here and the
            whole namespace is handed to ``load_models_tokenizer``.
    """
    model, tokenizer = load_models_tokenizer(args)

    data_root = args.eval_data_path
    columns = ['question', 'A', 'B', 'C', 'D', 'answer']
    subject_scores = {}
    for subject_name in tqdm(SUBJECTS):
        dev_csv = os.path.join(data_root, 'dev', f'{subject_name}_dev.csv')
        test_csv = os.path.join(data_root, 'test', f'{subject_name}_test.csv')
        dev_df = pd.read_csv(dev_csv, names=list(columns))
        test_df = pd.read_csv(test_csv, names=list(columns))

        subject_scores[subject_name] = eval_subject(
            model,
            tokenizer,
            subject_name,
            test_df,
            dev_df=dev_df,
            k=5,
            few_shot=True,
            save_result_dir="outs/mmlu_eval_result",
        )
    cal_mmlu(subject_scores)
# Maps each category label (printed verbatim by cal_mmlu) to the MMLU
# subject ids that belong to it.
TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'],
'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'],
'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'],
'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']}
# Flat list of every subject id, in category order; main() iterates over it.
SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl]
# Answer option labels; format_example() also uses them as row keys.
choices = ["A", "B", "C", "D"]
if __name__ == '__main__':
    # Command-line entry point: parse options, apply the seed, run the evaluation.
    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
    parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B")
    parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed')
    # NOTE(review): --gpu is parsed but not read anywhere in this script.
    parser.add_argument('--gpu', type=int, default=0, help='gpu id')

    # Provide extra arguments required for tasks.
    group = parser.add_argument_group(title='Evaluation options')
    # Required: main() joins this path with the split/file names, so a
    # missing value would otherwise fail later with a TypeError.
    group.add_argument('-d', '--eval_data_path', type=str, required=True,
                       help='Path to eval data')
    group.add_argument("--max-seq-len", type=int, default=2048,
                       help='Maximum prompt length in tokens; longer prompts '
                            'are truncated from the left.')
    group.add_argument("--debug", action='store_true', default=False,
                       help='Print infos.')

    # `args` stays a module-level name: get_logits() and eval_subject() read
    # args.max_seq_len / args.debug as globals.
    args = parser.parse_args()
    set_seed(args.seed)

    main(args)

59
eval/gsm8k_prompt.txt Normal file
View File

@@ -0,0 +1,59 @@
Question: In 2004, there were 60 kids at a cookout. In 2005, half the number of kids came to the cookout as compared to 2004. In 2006, 2/3 as many kids came to the cookout as in 2005. How many kids came to the cookout in 2006?
Let's think step by step
In 2005, 60/2=30 kids came to the cookout.
In 2006, 30/3*2=20 kids came to the cookout.
The answer is 20
Question: Zilla spent 7% of her monthly earnings on rent, half of it on her other monthly expenses, and put the rest in her savings. If she spent $133 on her rent, how much does she deposit into her savings account in a month?
Let's think step by step
Since $133 is equal to 7% of her earnings, then 1% is equal to $133/7 = $19.
The total monthly earning of Zilla is represented by 100%, so $19 x 100 = $1900 is her monthly earnings.
So, $1900/2 = $950 is spent on her other monthly expenses.
The total amount spent on the rent and other monthly expenses is $133 + $950 = $1083.
Hence, she saves $1900 - $1083 = $817 per month.
The answer is 817
Question: If Buzz bought a pizza with 78 slices at a restaurant and then decided to share it with the waiter in the ratio of 5:8, with Buzz's ratio being 5, what's twenty less the number of slices of pizza that the waiter ate?
Let's think step by step
The total ratio representing the slices of pizza that Buzz bought is 5+8=13
If he shared the slices of pizza with the waiter, the waiter received a fraction of 8/13 of the total number of slices, which totals 8/13 * 78 = 48 slices
Twenty less the number of slices of pizza that the waiter ate is 48-20 = 28
The answer is 28
Question: Jame gets a raise to $20 per hour and works 40 hours a week. His old job was $16 an hour for 25 hours per week. How much more money does he make per year in his new job than the old job if he works 52 weeks a year?
Let's think step by step
He makes 20*40=$800 per week
He used to make 16*25=$400 per week
So his raise was 800-400=$400 per week
So he makes 400*52=$20,800 per year more
The answer is 20800
Question: Mr. Gardner bakes 20 cookies, 25 cupcakes, and 35 brownies for his second-grade class of 20 students. If he wants to give each student an equal amount of sweet treats, how many sweet treats will each student receive?
Let's think step by step
Mr. Gardner bakes a total of 20 + 25 + 35 = 80 sweet treats
Each student will receive 80 / 20 = 4 sweet treats
The answer is 4
Question: A used car lot has 24 cars and motorcycles (in total) for sale. A third of the vehicles are motorcycles, and a quarter of the cars have a spare tire included. How many tires are on the used car lots vehicles in all?
Let's think step by step
The used car lot has 24 / 3 = 8 motorcycles with 2 tires each.
The lot has 24 - 8 = 16 cars for sale
There are 16 / 4 = 4 cars with a spare tire with 5 tires each.
The lot has 16 - 4 = 12 cars with 4 tires each.
Thus, the used car lots vehicles have 8 * 2 + 4 * 5 + 12 * 4 = 16 + 20 + 48 = 84 tires in all.
The answer is 84
Question: Norma takes her clothes to the laundry. She leaves 9 T-shirts and twice as many sweaters as T-shirts in the washer. When she returns she finds 3 sweaters and triple the number of T-shirts. How many items are missing?
Let's think step by step
Norma left 9 T-shirts And twice as many sweaters, she took 9 * 2= 18 sweaters
Adding the T-shirts and sweaters, Norma left 9 + 18 = 27 clothes
When she came back, she found 3 sweaters And triple the number of T-shirts, she found 3 * 3 = 9 T-shirts
Adding the T-shirts and sweaters, Norma found 3 + 9 = 12 clothes
Subtracting the clothes she left from the clothes she found, 27 - 12 = 15 clothes are missing
The answer is 15
Question: Adam has an orchard. Every day for 30 days he picks 4 apples from his orchard. After a month, Adam has collected all the remaining apples, which were 230. How many apples in total has Adam collected from his orchard?
Let's think step by step
During 30 days Adam picked 4 * 30 = 120 apples.
So in total with all the remaining apples, he picked 120 + 230 = 350 apples from his orchard.
The answer is 350