fix format problems in evaluation code; update ceval extraction rules

This commit is contained in:
feihu.hf
2023-08-25 22:44:07 +08:00
parent 1a9a04a91e
commit 4864f7b278
11 changed files with 1507 additions and 808 deletions

View File

@@ -1,14 +1,13 @@
import os
import pandas as pd
import numpy as np
import argparse
import datasets
import torch
import re
from thefuzz import process
from typing import List
import torch
import pandas as pd
from tqdm import tqdm
from thefuzz import process
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
'''
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
@@ -22,18 +21,29 @@ python eval/evaluate_chat_mmlu.py -d data/mmlu/data/
'''
def load_models_tokenizer(args):
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval()
model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True)
model.generation_config.do_sample = False # use greedy decoding
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
bf16=True,
use_flash_attn=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
return model, tokenizer
def format_example(line):
example = 'The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n' + line['question'] + "\n"
example = (
"The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n"
+ line["question"]
+ "\n"
)
for choice in choices:
example += f'{choice}. {line[f"{choice}"]}\n'
return example
@@ -47,13 +57,20 @@ def process_before_extraction(gen, choice_dict):
gen = pattern.sub(key, gen)
return gen
def extract_choice(gen, choice_list):
# answer is A | choice is A | choose A
res = re.search(r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", gen)
res = re.search(
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
gen,
)
# A is correct | A is right
if res is None:
res = re.search(r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", gen)
res = re.search(
r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
gen,
)
# straight answer: A
if res is None:
@@ -65,32 +82,37 @@ def extract_choice(gen, choice_list):
if res is None:
return choices[choice_list.index(process.extractOne(gen, choice_list)[0])]
else:
return res.group(1)
return res.group(1)
def extract_answer(response, row):
gen = process_before_extraction(response, {choice: row[choice] for choice in choices})
gen = process_before_extraction(
response, {choice: row[choice] for choice in choices}
)
pred = extract_choice(gen, [row[choice] for choice in choices])
return pred
@torch.no_grad()
def eval_subject(
model,
tokenizer,
subject_name,
test_df,
save_result_dir=None,
overwrite=False,
**kwargs
model,
tokenizer,
subject_name,
test_df,
save_result_dir=None,
overwrite=False,
**kwargs
):
result_path = os.path.join(save_result_dir, f'{subject_name}_result.csv')
result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv")
if not overwrite and os.path.exists(result_path):
print(f"{result_path} existed, skip!")
score = []
for (_, datarow), (_, resultrow) in zip(test_df.iterrows(), pd.read_csv(result_path).iterrows()):
for (_, datarow), (_, resultrow) in zip(
test_df.iterrows(), pd.read_csv(result_path).iterrows()
):
# pred = extract_answer(resultrow['model_response'], datarow)
pred = resultrow['model_output']
correct = 1 if pred == datarow['answer'] else 0
pred = resultrow["model_output"]
correct = 1 if pred == datarow["answer"] else 0
score.append(correct)
return score
@@ -100,7 +122,7 @@ def eval_subject(
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = format_example(row)
response, history = model.chat(
response, _ = model.chat(
tokenizer,
question,
history=None,
@@ -111,20 +133,24 @@ def eval_subject(
print(pred)
print("======================")
if 'answer' in row:
correct = 1 if pred == row['answer'] else 0
if "answer" in row:
correct = 1 if pred == row["answer"] else 0
score.append(correct)
if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}')
if args.debug:
print(f'{question} pred: {pred} ref: {row["answer"]}')
result.append(pred)
if save_result_dir:
test_df['model_output'] = result
test_df['model_response'] = response
test_df["model_output"] = result
test_df["model_response"] = response
if score:
test_df["correctness"] = score
os.makedirs(save_result_dir, exist_ok=True)
test_df.to_csv(os.path.join(
save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False)
test_df.to_csv(
os.path.join(save_result_dir, f"{subject_name}_result.csv"),
encoding="utf-8",
index=False,
)
return score
@@ -133,15 +159,13 @@ def cal_mmlu(res):
acc_sum_dict = dict()
acc_norm_sum_dict = dict()
cnt_dict = dict()
acc_sum = 0.
acc_sum = 0.0
cnt = 0
hard_cnt = 0
hard_acc_sum = 0.
for class_ in TASK_NAME_MAPPING.keys():
acc_sum_dict[class_] = 0.
acc_norm_sum_dict[class_] = 0.
cnt_dict[class_] = 0.
acc_sum_dict[class_] = 0.0
acc_norm_sum_dict[class_] = 0.0
cnt_dict[class_] = 0.0
for tt in TASK_NAME_MAPPING[class_]:
acc_sum += sum(res[tt])
@@ -150,13 +174,12 @@ def cal_mmlu(res):
acc_sum_dict[class_] += sum(res[tt])
cnt_dict[class_] += len(res[tt])
print('\n\n\n')
print("\n\n\n")
for k in TASK_NAME_MAPPING.keys():
if k in cnt_dict:
print('%s ACC: %.2f ' % (
k, acc_sum_dict[k] * 100 / cnt_dict[k]))
print('AVERAGE ACC:%.2f ' % (acc_sum *100 / cnt))
print("%s ACC: %.2f " % (k, acc_sum_dict[k] * 100 / cnt_dict[k]))
print("AVERAGE ACC:%.2f " % (acc_sum * 100 / cnt))
def main(args):
print("loading model weights")
@@ -170,38 +193,122 @@ def main(args):
for subject_name in tqdm(SUBJECTS):
# val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv')
# dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv')
test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv')
test_file_path = os.path.join(
args.eval_data_path, "test", f"{subject_name}_test.csv"
)
# val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer'])
# dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer'])
test_df = pd.read_csv(test_file_path, names=['question','A','B','C','D','answer'])
test_df = pd.read_csv(
test_file_path, names=["question", "A", "B", "C", "D", "answer"]
)
score = eval_subject(model, tokenizer, subject_name, test_df, save_result_dir=f"outs_chat/mmlu_eval_result", overwrite=args.overwrite)
score = eval_subject(
model,
tokenizer,
subject_name,
test_df,
save_result_dir=f"outs_chat/mmlu_eval_result",
overwrite=args.overwrite,
)
dev_result[subject_name] = score
cal_mmlu(dev_result)
TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'],
'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'],
'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'],
'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']}
TASK_NAME_MAPPING = {
"stem": [
"abstract_algebra",
"anatomy",
"astronomy",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"computer_security",
"conceptual_physics",
"electrical_engineering",
"elementary_mathematics",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_mathematics",
"high_school_physics",
"high_school_statistics",
"machine_learning",
],
"Humanities": [
"formal_logic",
"high_school_european_history",
"high_school_us_history",
"high_school_world_history",
"international_law",
"jurisprudence",
"logical_fallacies",
"moral_disputes",
"moral_scenarios",
"philosophy",
"prehistory",
"professional_law",
"world_religions",
],
"other": [
"business_ethics",
"college_medicine",
"human_aging",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"nutrition",
"professional_accounting",
"professional_medicine",
"virology",
"global_facts",
"clinical_knowledge",
],
"social": [
"econometrics",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_microeconomics",
"high_school_psychology",
"human_sexuality",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
],
}
SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl]
choices = ["A", "B", "C", "D"]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Test HF checkpoint.')
parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat")
parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title='Evaluation options')
group.add_argument('-d', '--eval_data_path', type=str,
help='Path to eval data')
group.add_argument("--debug", action='store_true', default=False,
help='Print infos.')
group.add_argument("--overwrite", action='store_true', default=False,
help='Overwrite existed results')
# Provide extra arguments required for tasks
group = parser.add_argument_group(title="Evaluation options")
group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data")
group.add_argument(
"--debug", action="store_true", default=False, help="Print infos."
)
group.add_argument(
"--overwrite",
action="store_true",
default=False,
help="Overwrite existed results",
)
args = parser.parse_args()
set_seed(args.seed)
main(args)
main(args)