"""大规模多任务语言理解评估基准 (MMLU),衡量LLM跨学科知识掌握能力。"""
# MMLU subject taxonomy: subject identifiers grouped into four top-level
# categories (keys are the Chinese category names used in reports).
# NOTE(review): 53 subjects are listed here; the canonical MMLU benchmark
# has 57 -- confirm whether the missing subjects were dropped intentionally.
MMLU_SUBJECTS = {
    "人文学科": [  # Humanities
        "formal_logic",  # Formal logic
        "logical_fallacies",  # Logical fallacies
        "moral_disputes",  # Moral disputes
        "moral_scenarios",  # Moral scenarios
        "philosophy",  # Philosophy
        "prehistory",  # Prehistory
        "professional_psychology",  # Professional psychology
        "world_religions"  # World religions
    ],
    "社会科学": [  # Social sciences
        "econometrics",  # Econometrics
        "high_school_geography",  # High-school geography
        "high_school_government_and_politics",  # High-school government & politics
        "high_school_macroeconomics",  # High-school macroeconomics
        "high_school_microeconomics",  # High-school microeconomics
        "high_school_psychology",  # High-school psychology
        "human_sexuality",  # Human sexuality
        "international_law",  # International law
        "jurisprudence",  # Jurisprudence
        "professional_law",  # Professional law
        "public_relations",  # Public relations
        "security_studies",  # Security studies
        "sociology",  # Sociology
        "us_foreign_policy"  # US foreign policy
    ],
    "STEM理工": [  # STEM
        "abstract_algebra",  # Abstract algebra
        "anatomy",  # Anatomy
        "astronomy",  # Astronomy
        "college_biology",  # College biology
        "college_chemistry",  # College chemistry
        "college_computer_science",  # College computer science
        "college_mathematics",  # College mathematics
        "college_physics",  # College physics
        "conceptual_physics",  # Conceptual physics
        "electrical_engineering",  # Electrical engineering
        "elementary_mathematics",  # Elementary mathematics
        "high_school_biology",  # High-school biology
        "high_school_chemistry",  # High-school chemistry
        "high_school_computer_science",  # High-school computer science
        "high_school_mathematics",  # High-school mathematics
        "high_school_physics",  # High-school physics
        "high_school_statistics",  # High-school statistics
        "machine_learning",  # Machine learning
        "nutrition",  # Nutrition
        "professional_medicine",  # Professional medicine
        "virology"  # Virology
    ],
    "其他专业": [  # Other professional fields
        "business_ethics",  # Business ethics
        "clinical_knowledge",  # Clinical knowledge
        "college_medicine",  # College medicine
        "global_facts",  # Global facts
        "human_aging",  # Human aging
        "management",  # Management
        "marketing",  # Marketing
        "medical_genetics",  # Medical genetics
        "miscellaneous",  # Miscellaneous
        "professional_accounting"  # Professional accounting
    ]
}
def analyze_mmlu_distribution():
    """Summarize how MMLU subjects are spread across top-level categories.

    Returns:
        Dict keyed by category name, each value holding the subject count,
        the category's share of all subjects (percent), and up to three
        example subject identifiers.
    """
    total_subjects = sum(len(v) for v in MMLU_SUBJECTS.values())
    return {
        category: {
            "count": len(subject_list),
            "percentage": len(subject_list) / total_subjects * 100,
            "example_subjects": subject_list[:3],  # first three as examples
        }
        for category, subject_list in MMLU_SUBJECTS.items()
    }
# Usage example: print each category's subject count and percentage share.
# Runs at import time as part of this demo script.
distribution = analyze_mmlu_distribution()
for category, info in distribution.items():
    print(f"{category}: {info['count']}个学科 ({info['percentage']:.1f}%)")
class MMLUDifficultyAnalyzer:
    """Classifies MMLU subjects into difficulty levels via name keywords."""

    def __init__(self):
        # Keyword -> difficulty label. Insertion order defines match priority
        # (e.g. "high_school" is tried before "college").
        self.difficulty_mapping = {
            "high_school": "高中水平",
            "college": "大学水平",
            "professional": "专业水平",
            "elementary": "基础水平",
            "advanced": "高级水平"
        }

    def categorize_by_difficulty(self, subject: str) -> str:
        """Return the difficulty label for a subject identifier.

        Drives the lookup from ``self.difficulty_mapping`` instead of a
        hard-coded if/elif chain that duplicated the mapping and left it
        unused. Subjects matching no keyword fall back to "高级水平",
        exactly as before.
        """
        for keyword, level in self.difficulty_mapping.items():
            if keyword in subject:
                return level
        return "高级水平"  # default for unqualified subject names

    def get_difficulty_distribution(self) -> dict:
        """Count subjects per difficulty level across MMLU_SUBJECTS.

        Returns:
            {level: {"count": int, "percentage": float}} over all subjects.
        """
        all_subjects = []
        for subjects in MMLU_SUBJECTS.values():
            all_subjects.extend(subjects)
        difficulty_count = {}
        for subject in all_subjects:
            difficulty = self.categorize_by_difficulty(subject)
            difficulty_count[difficulty] = difficulty_count.get(difficulty, 0) + 1
        total = len(all_subjects)
        return {
            level: {"count": count, "percentage": count / total * 100}
            for level, count in difficulty_count.items()
        }
# Module-level demo: computes the difficulty-level distribution over all
# MMLU subjects at import time.
analyzer = MMLUDifficultyAnalyzer()
difficulty_stats = analyzer.get_difficulty_distribution()
import csv
import json
import random
import re

from dataclasses import dataclass
from typing import List, Dict, Any, Tuple
@dataclass
class MMLUQuestion:
    """One MMLU multiple-choice question."""
    subject: str        # subject identifier, e.g. "college_physics"
    question: str       # the question text
    choices: List[str]  # the four answer options, displayed as A-D
    answer: str         # correct option letter: one of A, B, C, D
class MMLUEvaluator:
def __init__(self, data_path: str):
self.data_path = data_path
self.load_data()
self.results = {}
def load_data(self):
"""加载MMLU数据集"""
self.test_data = {}
# 加载各学科测试数据
for category, subjects in MMLU_SUBJECTS.items():
for subject in subjects:
try:
with open(f"{self.data_path}/{subject}_test.csv", 'r') as f:
questions = []
for line in f:
parts = line.strip().split(',')
if len(parts) >= 6:
question = MMLUQuestion(
subject=subject,
question=parts[0],
choices=[parts[1], parts[2], parts[3], parts[4]],
answer=parts[5]
)
questions.append(question)
self.test_data[subject] = questions
except FileNotFoundError:
print(f"Warning: {subject} data not found")
def evaluate_model(self, model_inference_func, shots: int = 5) -> Dict[str, Any]:
"""评估模型性能"""
all_results = {}
for subject, questions in self.test_data.items():
print(f"评估学科: {subject}")
# Few-shot示例构建
few_shot_examples = self.build_few_shot_examples(subject, shots)
subject_results = []
for question in questions:
# 构建提示词
prompt = self.build_evaluation_prompt(few_shot_examples, question)
# 模型推理
try:
model_answer = model_inference_func(prompt)
predicted_choice = self.extract_choice(model_answer)
is_correct = predicted_choice == question.answer
subject_results.append({
"question": question.question,
"correct_answer": question.answer,
"predicted_answer": predicted_choice,
"is_correct": is_correct,
"model_response": model_answer
})
except Exception as e:
print(f"Error evaluating question: {e}")
subject_results.append({
"question": question.question,
"correct_answer": question.answer,
"predicted_answer": "ERROR",
"is_correct": False,
"error": str(e)
})
# 计算学科准确率
correct_count = sum(1 for r in subject_results if r["is_correct"])
accuracy = correct_count / len(subject_results) if subject_results else 0
all_results[subject] = {
"accuracy": accuracy,
"correct": correct_count,
"total": len(subject_results),
"details": subject_results
}
print(f"{subject} 准确率: {accuracy:.3f} ({correct_count}/{len(subject_results)})")
# 计算总体统计
overall_stats = self.calculate_overall_stats(all_results)
return {
"subject_results": all_results,
"overall_stats": overall_stats,
"metadata": {
"total_subjects": len(all_results),
"evaluation_method": f"{shots}-shot",
"timestamp": datetime.now().isoformat()
}
}
def build_few_shot_examples(self, subject: str, num_shots: int) -> List[MMLUQuestion]:
"""构建Few-shot示例"""
# 从dev集或train集中随机选择示例(这里简化为从测试集前几个)
if subject in self.test_data:
questions = self.test_data[subject]
return random.sample(questions[:20], min(num_shots, len(questions), 20))
return []
def build_evaluation_prompt(self, examples: List[MMLUQuestion], test_question: MMLUQuestion) -> str:
"""构建评估提示词"""
prompt = "以下是一些多选题示例,请选择最佳答案。\n\n"
# 添加Few-shot示例
for i, example in enumerate(examples):
prompt += f"示例 {i+1}:\n"
prompt += f"问题: {example.question}\n"
for j, choice in enumerate(example.choices):
prompt += f"{chr(65+j)}. {choice}\n"
prompt += f"答案: {example.answer}\n\n"
# 添加测试问题
prompt += "现在请回答以下问题:\n"
prompt += f"问题: {test_question.question}\n"
for j, choice in enumerate(test_question.choices):
prompt += f"{chr(65+j)}. {choice}\n"
prompt += "答案: "
return prompt
def extract_choice(self, model_response: str) -> str:
"""从模型响应中提取选择"""
response = model_response.strip().upper()
# 寻找A、B、C、D
for choice in ['A', 'B', 'C', 'D']:
if choice in response:
return choice
# 如果没有找到,返回第一个字符(如果是有效选择)
if response and response[0] in ['A', 'B', 'C', 'D']:
return response[0]
return 'A' # 默认返回A
def calculate_overall_stats(self, results: Dict[str, Any]) -> Dict[str, Any]:
"""计算总体统计信息"""
total_questions = sum(r["total"] for r in results.values())
total_correct = sum(r["correct"] for r in results.values())
overall_accuracy = total_correct / total_questions if total_questions > 0 else 0
# 按类别统计
category_stats = {}
for category, subjects in MMLU_SUBJECTS.items():
category_correct = 0
category_total = 0
for subject in subjects:
if subject in results:
category_correct += results[subject]["correct"]
category_total += results[subject]["total"]
if category_total > 0:
category_stats[category] = {
"accuracy": category_correct / category_total,
"correct": category_correct,
"total": category_total
}
return {
"overall_accuracy": overall_accuracy,
"total_correct": total_correct,
"total_questions": total_questions,
"category_breakdown": category_stats,
"subject_count": len(results)
}
class MMLUAnalyzer:
    """Post-hoc analysis of MMLU evaluation results: errors, suggestions, diffs."""

    def __init__(self, evaluation_results: Dict):
        # Expects the dict returned by MMLUEvaluator.evaluate_model().
        self.results = evaluation_results

    def analyze_error_patterns(self) -> Dict[str, Any]:
        """Aggregate error statistics by subject and by category.

        NOTE(review): the "by_difficulty" and "common_mistakes" buckets are
        reserved but not yet populated -- confirm whether downstream code
        expects them filled.
        """
        error_analysis = {
            "by_subject": {},
            "by_category": {},
            "by_difficulty": {},
            "common_mistakes": []
        }
        # Per-subject error rate and a few sample failures.
        for subject, result in self.results["subject_results"].items():
            if result["total"] > 0:
                error_rate = 1 - result["accuracy"]
                error_analysis["by_subject"][subject] = {
                    "error_rate": error_rate,
                    "error_count": result["total"] - result["correct"],
                    "sample_errors": self.get_sample_errors(result["details"])
                }
        # Per-category roll-up across the MMLU taxonomy.
        for category, subjects in MMLU_SUBJECTS.items():
            category_errors = []
            for subject in subjects:
                if subject in self.results["subject_results"]:
                    subject_result = self.results["subject_results"][subject]
                    category_errors.extend(
                        d for d in subject_result["details"] if not d["is_correct"]
                    )
            if category_errors:
                # Hoisted denominator: total questions seen in this category.
                category_total = sum(
                    self.results["subject_results"][s]["total"]
                    for s in subjects
                    if s in self.results["subject_results"]
                )
                error_analysis["by_category"][category] = {
                    "total_errors": len(category_errors),
                    "error_rate": len(category_errors) / category_total,
                    "sample_errors": category_errors[:5]  # first 5 error samples
                }
        return error_analysis

    def get_sample_errors(self, details: List[Dict], limit: int = 3) -> List[Dict]:
        """Return up to ``limit`` incorrect answer records from ``details``."""
        errors = [d for d in details if not d["is_correct"]]
        return errors[:limit]

    def generate_improvement_suggestions(self) -> List[str]:
        """Heuristic training suggestions based on accuracy thresholds."""
        suggestions = []
        # Overall performance check.
        overall_accuracy = self.results["overall_stats"]["overall_accuracy"]
        if overall_accuracy < 0.5:
            suggestions.append("整体准确率较低,建议增强基础知识训练")
        # Categories trailing the overall accuracy by more than 10 points.
        category_stats = self.results["overall_stats"]["category_breakdown"]
        for category, stats in category_stats.items():
            if stats["accuracy"] < overall_accuracy - 0.1:
                suggestions.append(f"{category}领域表现相对较弱,建议加强相关知识训练")
        # Individual subjects below 30% accuracy.
        subject_results = self.results["subject_results"]
        weak_subjects = [
            subject for subject, result in subject_results.items()
            if result["accuracy"] < 0.3
        ]
        if weak_subjects:
            suggestions.append(f"以下学科需要重点改进:{', '.join(weak_subjects[:5])}")
        return suggestions

    def compare_with_baseline(self, baseline_results: Dict) -> Dict[str, Any]:
        """Compute accuracy deltas (current - baseline) at every granularity.

        Fix: "category_improvements" was declared but never populated; it is
        now filled from the category breakdowns of both result sets.
        """
        comparison = {
            "overall_improvement": 0,
            "subject_improvements": {},
            "category_improvements": {}
        }
        # Overall delta.
        current_accuracy = self.results["overall_stats"]["overall_accuracy"]
        baseline_accuracy = baseline_results["overall_stats"]["overall_accuracy"]
        comparison["overall_improvement"] = current_accuracy - baseline_accuracy
        # Per-subject deltas (only subjects present in both runs).
        for subject in self.results["subject_results"]:
            if subject in baseline_results["subject_results"]:
                current_acc = self.results["subject_results"][subject]["accuracy"]
                baseline_acc = baseline_results["subject_results"][subject]["accuracy"]
                comparison["subject_improvements"][subject] = current_acc - baseline_acc
        # Per-category deltas (only categories present in both runs).
        current_cats = self.results["overall_stats"].get("category_breakdown", {})
        baseline_cats = baseline_results["overall_stats"].get("category_breakdown", {})
        for category in current_cats:
            if category in baseline_cats:
                comparison["category_improvements"][category] = (
                    current_cats[category]["accuracy"] - baseline_cats[category]["accuracy"]
                )
        return comparison
import asyncio
from datetime import datetime
import pandas as pd
class AutoMMLUEvaluator:
    """Batch-runs MMLUEvaluator over several model configs and builds reports."""

    def __init__(self, model_configs: List[Dict]):
        # Each config: {"name": str, "inference_function": callable,
        #               "shots": int (optional, default 5)}.
        self.model_configs = model_configs
        self.evaluator = MMLUEvaluator("./mmlu_data")

    async def batch_evaluate_models(self) -> Dict[str, Any]:
        """Evaluate every configured model sequentially.

        NOTE(review): declared async for caller convenience, but the
        underlying evaluation is synchronous and will block the event loop.
        """
        all_results = {}
        for config in self.model_configs:
            model_name = config["name"]
            inference_func = config["inference_function"]
            print(f"开始评估模型: {model_name}")
            start_time = datetime.now()
            try:
                results = self.evaluator.evaluate_model(
                    model_inference_func=inference_func,
                    shots=config.get("shots", 5)
                )
                results["model_name"] = model_name
                results["evaluation_time"] = (datetime.now() - start_time).total_seconds()
                results["config"] = config
                all_results[model_name] = results
                print(f"{model_name} 评估完成,总体准确率: {results['overall_stats']['overall_accuracy']:.3f}")
            except Exception as e:
                # Record the failure so one bad model doesn't abort the batch.
                print(f"{model_name} 评估失败: {e}")
                all_results[model_name] = {"error": str(e)}
        return all_results

    def generate_leaderboard(self, results: Dict[str, Any]) -> pd.DataFrame:
        """Build a leaderboard DataFrame sorted by overall accuracy, descending.

        Models whose entry contains an "error" key are excluded. Fix: an
        empty result set previously raised KeyError because ``sort_values``
        was called on a column-less DataFrame.
        """
        leaderboard_data = []
        for model_name, result in results.items():
            if "error" not in result:
                row = {
                    "Model": model_name,
                    "Overall Accuracy": f"{result['overall_stats']['overall_accuracy']:.3f}",
                    "Total Questions": result['overall_stats']['total_questions'],
                    "Correct Answers": result['overall_stats']['total_correct']
                }
                # One extra column per category.
                for category, stats in result['overall_stats']['category_breakdown'].items():
                    row[f"{category} Acc"] = f"{stats['accuracy']:.3f}"
                leaderboard_data.append(row)
        df = pd.DataFrame(leaderboard_data)
        if df.empty:
            # Nothing to sort; return the empty frame as-is.
            return df
        # Accuracies are fixed-width "x.xxx" strings, so lexicographic order
        # coincides with numeric order here.
        return df.sort_values("Overall Accuracy", ascending=False)

    def export_detailed_report(self, results: Dict[str, Any], output_path: str):
        """Write a full JSON report (summary, raw results, leaderboard)."""
        report = {
            "evaluation_summary": {
                "timestamp": datetime.now().isoformat(),
                "models_evaluated": len(results),
                "total_subjects": len(MMLU_SUBJECTS)
            },
            "model_results": results,
            "leaderboard": self.generate_leaderboard(results).to_dict('records')
        }
        # ensure_ascii=False keeps the Chinese category names readable.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"详细报告已保存至: {output_path}")
# Usage example
def gpt4_inference(prompt: str) -> str:
    """Example inference function: query GPT-4 for a (short) answer letter."""
    from openai import OpenAI

    # max_tokens=5 is enough for a single option letter; temperature=0 keeps
    # the choice deterministic.
    completion = OpenAI().chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5
    )
    return completion.choices[0].message.content
# Demo driver: evaluate the configured models and export a JSON report.
model_configs = [
    {
        "name": "GPT-4",
        "inference_function": gpt4_inference,
        "shots": 5
    }
    # Additional model configs can be appended here.
]
evaluator = AutoMMLUEvaluator(model_configs)
# Fix: `await` is a SyntaxError at module level in a plain script; drive the
# coroutine with asyncio.run() instead (asyncio is imported above).
results = asyncio.run(evaluator.batch_evaluate_models())
evaluator.export_detailed_report(results, "mmlu_evaluation_report.json")
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
class MMLUVisualizer:
    """Matplotlib/seaborn charts over multi-model MMLU evaluation results."""

    def __init__(self, evaluation_results: Dict):
        # Mapping: model name -> result dict from MMLUEvaluator.evaluate_model().
        self.results = evaluation_results
        # NOTE(review): mutates *global* matplotlib state; the SimHei font
        # must be installed for CJK labels to render -- verify on target hosts.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

    def plot_category_performance(self, model_name: str = None):
        """Bar chart of per-category accuracy for one model.

        Falls back to the first model when ``model_name`` is absent.
        """
        if not self.results:
            # Fix: next(iter(...)) below raised StopIteration on empty input.
            return
        if model_name and model_name in self.results:
            data = self.results[model_name]
        else:
            # Default to the first model's results.
            data = next(iter(self.results.values()))
        category_stats = data["overall_stats"]["category_breakdown"]
        categories = list(category_stats.keys())
        accuracies = [stats["accuracy"] for stats in category_stats.values()]
        plt.figure(figsize=(12, 6))
        bars = plt.bar(categories, accuracies, color='skyblue', alpha=0.7)
        # Numeric label above each bar.
        for bar, acc in zip(bars, accuracies):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                     f'{acc:.3f}', ha='center', va='bottom')
        plt.title(f'MMLU Categories Performance - {model_name or "Model"}')
        plt.xlabel('Knowledge Categories')
        plt.ylabel('Accuracy')
        plt.xticks(rotation=45)
        plt.ylim(0, 1)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    def plot_subject_heatmap(self, top_n: int = 20):
        """Heatmap of model x subject accuracy for the top-N subjects.

        Subjects are ranked by mean accuracy across models; missing
        (model, subject) pairs are shown as 0.
        """
        if not self.results:
            return
        models = list(self.results.keys())
        # Union of subjects seen by any model.
        all_subjects = set()
        for result in self.results.values():
            if "subject_results" in result:
                all_subjects.update(result["subject_results"].keys())
        # Mean accuracy per subject across the models that evaluated it.
        subject_scores = {}
        for subject in all_subjects:
            scores = []
            for model_result in self.results.values():
                if "subject_results" in model_result and subject in model_result["subject_results"]:
                    scores.append(model_result["subject_results"][subject]["accuracy"])
            if scores:
                subject_scores[subject] = np.mean(scores)
        # Keep the top-N subjects by mean accuracy.
        sorted_subjects = sorted(subject_scores.items(), key=lambda x: x[1], reverse=True)
        selected_subjects = [s[0] for s in sorted_subjects[:top_n]]
        # Matrix rows = models, columns = selected subjects.
        heatmap_data = []
        for model in models:
            model_row = []
            for subject in selected_subjects:
                if ("subject_results" in self.results[model] and
                        subject in self.results[model]["subject_results"]):
                    model_row.append(self.results[model]["subject_results"][subject]["accuracy"])
                else:
                    model_row.append(0)
            heatmap_data.append(model_row)
        plt.figure(figsize=(15, max(8, len(models) * 0.5)))
        sns.heatmap(heatmap_data,
                    xticklabels=selected_subjects,
                    yticklabels=models,
                    annot=True,
                    fmt='.3f',
                    cmap='RdYlGn',
                    center=0.5)
        plt.title(f'MMLU Subject Performance Heatmap (Top {top_n} Subjects)')
        plt.xlabel('Subjects')
        plt.ylabel('Models')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    def plot_model_comparison(self):
        """Side-by-side comparison of overall and per-category accuracy."""
        if len(self.results) < 2:
            print("需要至少2个模型进行对比")
            return
        # Fix: only models that actually produced stats are plotted; this
        # keeps bar positions aligned with the collected accuracies.
        models = [m for m in self.results if "overall_stats" in self.results[m]]
        overall_accs = []
        category_data = {}  # category -> {model: accuracy}
        for model in models:
            stats = self.results[model]["overall_stats"]
            overall_accs.append(stats["overall_accuracy"])
            for category, cat_stats in stats["category_breakdown"].items():
                category_data.setdefault(category, {})[model] = cat_stats["accuracy"]
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        # Left panel: overall accuracy per model.
        bars1 = ax1.bar(models, overall_accs, color='lightcoral', alpha=0.7)
        ax1.set_title('Overall MMLU Performance')
        ax1.set_ylabel('Accuracy')
        ax1.set_ylim(0, 1)
        for bar, acc in zip(bars1, overall_accs):
            ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                     f'{acc:.3f}', ha='center', va='bottom')
        # Right panel: grouped bars per category.
        categories = list(category_data.keys())
        x = np.arange(len(categories))
        # Fix: scale bar width to the model count (the fixed 0.35 overlapped
        # with 3+ models) and center the ticks under each bar group.
        width = 0.8 / max(len(models), 1)
        for i, model in enumerate(models):
            # Missing categories default to 0 instead of raising IndexError.
            model_scores = [category_data[cat].get(model, 0) for cat in categories]
            ax2.bar(x + i * width, model_scores, width, label=model, alpha=0.7)
        ax2.set_title('Category Performance Comparison')
        ax2.set_xlabel('Categories')
        ax2.set_ylabel('Accuracy')
        ax2.set_xticks(x + width * (len(models) - 1) / 2)
        ax2.set_xticklabels(categories, rotation=45)
        ax2.legend()
        ax2.set_ylim(0, 1)
        plt.tight_layout()
        plt.show()