fix format problems in evaluation code; update ceval extraction rules

This commit is contained in:
feihu.hf
2023-08-25 22:44:07 +08:00
parent 1a9a04a91e
commit 4864f7b278
11 changed files with 1507 additions and 808 deletions

View File

@@ -12,47 +12,48 @@ from transformers.generation import GenerationConfig
from transformers.tools.evaluate_agent import evaluate_agent
from transformers.trainer_utils import set_seed
# Directory holding the evaluation data files. It is resolved relative to this
# script so the data is found regardless of the current working directory.
data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def is_callable(response, golden):
    """Return True when the generated action matches the reference action.

    Args:
        response: Mapping with an "action" string taken from the model output.
        golden: Mapping with an "action" string taken from the reference.

    Returns:
        bool: True if both action strings are equal after stripping
        surrounding whitespace and lower-casing.
    """
    return response["action"].strip().lower() == golden["action"].strip().lower()
def process_res(response):
    """Split a ReAct-style response string into its labelled sections.

    The text is cut at the markers "Action:", "Action Input:",
    "Observation:", "Thought:" and "Final Answer:" using plain substring
    search.

    Args:
        response: The raw response text.

    Returns:
        dict: The keys "thought", "action", "action_input", "observation",
        "thought_last" and "final_answer", each mapped to the stripped text
        of that section. "action_input" is re-serialised as JSON with sorted
        keys when it can be parsed; otherwise the raw text is kept.
    """
    # A trailing newline is appended so that, when a marker is missing and
    # ``find`` returns -1, an end index of -1 only drops this newline instead
    # of a real character (the original "not-find" workaround).
    response += "\n"

    action_pos = response.find("Action:")
    action_input_pos = response.find("Action Input:")
    observation_pos = response.find("Observation:")
    last_thought_pos = response.rfind("Thought:")
    final_answer_pos = response.find("Final Answer:")

    thought = response[:action_pos].strip()
    action = response[action_pos + len("Action:") : action_input_pos].strip()
    action_input = response[
        action_input_pos + len("Action Input:") : observation_pos
    ].strip()
    # TODO: This parsing result is incorrect if the response contains multiple
    # Actions. To be fixed in the future.
    observation = response[
        observation_pos + len("Observation:") : last_thought_pos
    ].strip()
    thought_last = response[
        last_thought_pos + len("Thought:") : final_answer_pos
    ].strip()
    final_answer = response[final_answer_pos + len("Final Answer:") :].strip()

    try:
        # Normalise the JSON so that formatting and key order do not affect
        # later comparisons of the action input.
        action_input = json.dumps(
            json5.loads(action_input), ensure_ascii=False, sort_keys=True
        )
    except Exception:
        # Best effort: keep the raw text when it cannot be parsed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        pass

    return {
        "thought": thought,
        "action": action,
        "action_input": action_input,
        "observation": observation,
        "thought_last": thought_last,
        "final_answer": final_answer,
    }
@@ -68,20 +69,18 @@ def _get_tokenized_string(tokenizer, text_list):
assert tokenizer is not None
token_ids = tokenizer.encode(text)
tokens_bytes = tokenizer.convert_ids_to_tokens(token_ids)
tokens = [
token.decode('utf-8', errors='replace') for token in tokens_bytes
]
tokenized_string = ' '.join(tokens)
tokens = [token.decode("utf-8", errors="replace") for token in tokens_bytes]
tokenized_string = " ".join(tokens)
token_ids_list.append(token_ids)
tokenized_string_list.append(tokenized_string)
return token_ids_list, tokenized_string_list
def eval_action(job):
response = job['gen'][0]
golden = job['response']
response = job["gen"][0]
golden = job["response"]
if 'Action:' in response:
if "Action:" in response:
response, golden = process_res(response), process_res(golden)
if is_callable(response, golden):
return True
@@ -89,26 +88,29 @@ def eval_action(job):
def eval_action_input(job, tokenizer):
    """Score the generated action input against the reference with ROUGE-L.

    Args:
        job: Mapping where ``job["gen"][0]`` is the generated response text
            and ``job["response"]`` is the reference response text.
        tokenizer: Tokenizer forwarded to ``_get_tokenized_string``.

    Returns:
        The ROUGE-L F-measure between the tokenized reference action input
        and the tokenized generated action input.
    """
    response = process_res(job["gen"][0])
    golden = process_res(job["response"])

    # Only the tokenized strings are needed for scoring; the token id lists
    # returned alongside them are not used here.
    _, gen_tok_str = _get_tokenized_string(tokenizer, [response["action_input"]])
    _, reference_tok_str = _get_tokenized_string(
        tokenizer, [golden["action_input"]]
    )

    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"], tokenizer=_DummyTokenizer()
    )
    score = scorer.score(reference_tok_str[0], gen_tok_str[0])
    return score["rougeL"].fmeasure
@@ -124,24 +126,33 @@ class QWenAgent(Agent):
agent.run("Draw me a picture of rivers and lakes.")
```
"""
def __init__(self,
chat_prompt_template=None,
run_prompt_template=None,
additional_tools=None,
tokenizer=None,
model=None):
def __init__(
self,
chat_prompt_template=None,
run_prompt_template=None,
additional_tools=None,
tokenizer=None,
model=None,
):
if tokenizer and model:
self.tokenizer = tokenizer
self.model = model
else:
checkpoint = 'Qwen/Qwen-7B-Chat'
checkpoint = "Qwen/Qwen-7B-Chat"
self.tokenizer = AutoTokenizer.from_pretrained(
checkpoint, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
checkpoint, device_map='auto',
trust_remote_code=True).cuda().eval()
checkpoint, trust_remote_code=True
)
self.model = (
AutoModelForCausalLM.from_pretrained(
checkpoint, device_map="auto", trust_remote_code=True
)
.cuda()
.eval()
)
self.model.generation_config = GenerationConfig.from_pretrained(
checkpoint, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
checkpoint, trust_remote_code=True
) # 可指定不同的生成长度、top_p等相关超参
self.model.generation_config.do_sample = False # greedy
super().__init__(
@@ -152,155 +163,161 @@ class QWenAgent(Agent):
def generate_one(self, prompt, stop):
    """Generate one completion for ``prompt`` and trim trailing stop sequences.

    Args:
        prompt: Prompt text passed to ``self.model.chat``.
        stop: Iterable of stop strings; any that the result ends with is
            removed from the end of the result.

    Returns:
        str: The generated text with trailing stop sequences removed.
    """
    # "Human:" and "Assistant:" used to be special reserved words for Qwen
    # (Tongyi Qianwen), so they are replaced with "_HUMAN_:" and
    # "_ASSISTANT_:". This issue will be fixed in a future version.
    prompt = prompt.replace("Human:", "_HUMAN_:").replace(
        "Assistant:", "_ASSISTANT_:"
    )
    stop = [
        item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:")
        for item in stop
    ]

    result, _ = self.model.chat(self.tokenizer, prompt, history=None)
    for stop_seq in stop:
        # Skip empty stop sequences: every string "ends with" "", and
        # result[:-0] would wipe the whole result.
        if stop_seq and result.endswith(stop_seq):
            result = result[: -len(stop_seq)]

    # Undo the placeholder substitution in the generated text.
    result = result.replace("_HUMAN_:", "Human:").replace(
        "_ASSISTANT_:", "Assistant:"
    )
    return result
def load_models_tokenizer(args):
    """Load the model and tokenizer from ``args.checkpoint_path``.

    Args:
        args: Object with a ``checkpoint_path`` attribute that is passed to
            the ``from_pretrained`` loaders.

    Returns:
        tuple: ``(model, tokenizer)``. The model is put in eval mode and its
        generation config is loaded from the same checkpoint with sampling
        turned off.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        device_map="auto",
        trust_remote_code=True,
        bf16=True,
        use_flash_attn=True,
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    model.generation_config.do_sample = False  # use greedy decoding
    return model, tokenizer
def load_jobs(filename):
    """Read every record of a JSON Lines file under ``data_root_path``.

    Args:
        filename: File name relative to ``data_root_path``.

    Returns:
        list: The records in file order.
    """
    with jsonlines.open(os.path.join(data_root_path, filename), mode="r") as reader:
        return list(reader)
def react_inference(filename, model, tokenizer):
    """Run the model over every job in ``filename``, caching the generations.

    If ``<filename>.cache`` already exists under ``data_root_path`` it is
    loaded and returned without running the model. Otherwise each job's
    prompt is sent to ``model.chat`` and the response is stored under the
    job's "gen" key as a one-element list.

    Args:
        filename: Job file name relative to ``data_root_path``.
        model: Object exposing ``chat(tokenizer, prompt, history=None)``.
        tokenizer: Tokenizer passed through to ``model.chat``.

    Returns:
        list: The jobs, each carrying a "gen" entry.
    """
    filename_cache = filename + ".cache"
    cache_path = os.path.join(data_root_path, filename_cache)

    if os.path.exists(cache_path):
        jobs = load_jobs(filename=filename_cache)
        print("Loaded from", filename_cache)
        return jobs

    # Load the jobs before touching the cache so a missing or broken input
    # file cannot leave an empty cache behind.
    jobs = load_jobs(filename=filename)
    print("Inference:", filename)

    # Write to a temporary file and move it into place only after every job
    # succeeded; an interrupted run must not leave a truncated cache that a
    # later run would silently trust.
    tmp_path = cache_path + ".tmp"
    try:
        # UTF-8 is set explicitly because records are dumped with
        # ensure_ascii=False.
        with open(tmp_path, "w", encoding="utf-8") as f:
            for job in tqdm(jobs):
                response, _ = model.chat(tokenizer, job["prompt"], history=None)
                job["gen"] = [response]
                f.write(json.dumps(job, ensure_ascii=False) + "\n")
        os.replace(tmp_path, cache_path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(filename_cache, "is saved.")
    return jobs
def main(args):
    """Run the evaluations selected in ``args`` and pretty-print the scores.

    Args:
        args: Parsed command-line namespace. Reads ``checkpoint_path``,
            ``eval_react_positive``, ``eval_react_positive_filename``,
            ``eval_react_negative``, ``eval_react_negative_filename`` and
            ``eval_hfagent``.

    Raises:
        ValueError: If a selected ReAct evaluation file yields no jobs.
    """
    print("loading model weights")
    if args.checkpoint_path is not None:
        model, tokenizer = load_models_tokenizer(args)
    else:
        model, tokenizer = None, None
    print("model loaded")

    result = {}

    # eval react positive
    if args.eval_react_positive:
        print("eval react positive ...")
        acc_count = 0
        rouge_mean = 0
        jobs = react_inference(
            filename=args.eval_react_positive_filename,
            model=model,
            tokenizer=tokenizer,
        )
        if not jobs:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise ValueError(
                "No jobs loaded from " + args.eval_react_positive_filename
            )
        n_jobs = len(jobs)
        for job in jobs:
            if eval_action(job):
                acc_count += 1
            rouge = eval_action_input(job, tokenizer)
            rouge_mean += rouge / n_jobs

        scores = {
            "action_right_rate": acc_count / n_jobs,
            "action_input_rouge": rouge_mean,
        }
        result.update({"react_positive": scores})

    # eval react negative
    if args.eval_react_negative:
        print("eval react negative ...")
        bad_count = 0
        jobs = react_inference(
            filename=args.eval_react_negative_filename,
            model=model,
            tokenizer=tokenizer,
        )
        if not jobs:
            raise ValueError(
                "No jobs loaded from " + args.eval_react_negative_filename
            )
        for job in jobs:
            # A generation that contains an "Action:" line is counted as bad.
            if "\nAction:" in job["gen"][0]:
                bad_count += 1
        scores = {"bad_rate": bad_count / len(jobs)}
        result.update({"react_negative": scores})

    # eval hfagent
    if args.eval_hfagent:
        print("eval hfagent ...")
        agent = QWenAgent(model=model, tokenizer=tokenizer)
        scores = evaluate_agent(agent, verbose=False, return_errors=False)
        result.update({"hfagent": scores})

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(result)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Test HF checkpoint.')
parser.add_argument('-c',
'--checkpoint-path',
type=str,
help='Checkpoint path',
default='Qwen/Qwen-7B-Chat')
parser.add_argument('-s',
'--seed',
type=int,
default=1234,
help='Random seed')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title='Evaluation options')
group.add_argument('--eval-react-positive',
action='store_true',
default=False,
help='Eval react positive.')
group.add_argument('--eval-react-positive-filename',
type=str,
default='exam_plugin_v1_react_positive.jsonl',
help='Eval react positive filename.')
group.add_argument('--eval-react-negative',
action='store_true',
default=False,
help='Eval react negative.')
group.add_argument('--eval-react-negative-filename',
type=str,
default='exam_plugin_v1_react_negative.jsonl',
help='Eval react negative filename.')
group.add_argument('--eval-hfagent',
action='store_true',
default=False,
help='Eval hfagent.')
group = parser.add_argument_group(title="Evaluation options")
group.add_argument(
"--eval-react-positive",
action="store_true",
default=False,
help="Eval react positive.",
)
group.add_argument(
"--eval-react-positive-filename",
type=str,
default="exam_plugin_v1_react_positive.jsonl",
help="Eval react positive filename.",
)
group.add_argument(
"--eval-react-negative",
action="store_true",
default=False,
help="Eval react negative.",
)
group.add_argument(
"--eval-react-negative-filename",
type=str,
default="exam_plugin_v1_react_negative.jsonl",
help="Eval react negative filename.",
)
group.add_argument(
"--eval-hfagent", action="store_true", default=False, help="Eval hfagent."
)
args = parser.parse_args()
set_seed(args.seed)