update agent benchmarks and add qwen-72b results

2026-05-21 17:05:46 +08:00 · 2023-12-06 12:57:11 +08:00
parent a0a557aad8
commit 7eb9016908
7 changed files with 314 additions and 771 deletions
--- a/eval/EVALUATION.md
+++ b/eval/EVALUATION.md
@@ -85,9 +85,12 @@ This script is used to reproduce the results of the ReAct and Hugging Face Agent
 # Qwen-7B-Chat
 mkdir data;
 cd data;
-wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_positive.jsonl;
-wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_negative.jsonl;
-cd ..;
+## Old Evaluation Dataset (Version 20230803)
+# wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_positive.jsonl;
+# wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_negative.jsonl;
+## New Evaluation Dataset (Version 20231206)
+wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_positive.jsonl;
+wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_negative.jsonl; cd ..;
 pip install json5;
 pip install jsonlines;
 pip install rouge_score;