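# BIG-bench: 超越模仿游戏的综合评估基准,204个任务测试LLM能力边界
# (BIG-bench, "Beyond the Imitation Game": a comprehensive evaluation benchmark whose 204 tasks probe the limits of LLM capability.)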
import json
import random
import re
import time
from collections import Counter
from datetime import datetime
from typing import Dict, List, Any, Optional

import numpy as np
class BIGBenchDataset:
    """In-memory catalogue of simulated BIG-bench task metadata.

    ``task_categories`` declares, per category, a handful of sample task names
    and the number of tasks that category is said to contain (``count``).
    ``all_tasks`` is built from the sample names only, so it holds one entry
    per listed name, not ``count`` entries per category.

    Args:
        seed: Optional seed for the randomly assigned task attributes
            (difficulty, chain-of-thought flag). With ``None`` the module-level
            ``random`` generator is used, exactly as before.
    """

    def __init__(self, seed: Optional[int] = None):
        self.task_categories = {
            "语言学和语言理解": {
                "tasks": [
                    "语法错误检测", "语义消歧", "语言翻译",
                    "方言识别", "语音学分析", "语法分析"
                ],
                "count": 42,
                "description": "测试语言结构和语义理解能力"
            },
            "数学和逻辑推理": {
                "tasks": [
                    "数学运算", "几何问题", "逻辑推理",
                    "概率计算", "代数求解", "证明构造"
                ],
                "count": 38,
                "description": "评估数学计算和逻辑思维能力"
            },
            "科学知识和推理": {
                "tasks": [
                    "物理原理", "化学反应", "生物学概念",
                    "天文学知识", "地质学理解", "环境科学"
                ],
                "count": 45,
                "description": "检验科学领域的知识掌握和应用"
            },
            "常识推理和社会理解": {
                "tasks": [
                    "社会常识", "道德判断", "文化理解",
                    "心理学概念", "人际关系", "社会规范"
                ],
                "count": 36,
                "description": "测试对人类社会和行为的理解"
            },
            "创造性思维和问题解决": {
                "tasks": [
                    "创意写作", "问题分解", "类比推理",
                    "假设检验", "策略制定", "创新思考"
                ],
                "count": 28,
                "description": "评估创造性和复杂问题解决能力"
            },
            "偏见检测和安全性": {
                "tasks": [
                    "性别偏见检测", "种族偏见识别", "年龄歧视判断",
                    "宗教偏见分析", "社会刻板印象", "公平性评估"
                ],
                "count": 15,
                "description": "检测模型中的偏见和安全隐患"
            }
        }
        self._seed = seed
        self.initialize_tasks()

    def initialize_tasks(self):
        """Build ``self.all_tasks`` from the sample task names of every category.

        Each task gets a sequential id ``bigbench_<n>``, a random difficulty and
        a random ``requires_chain_of_thought`` flag. Prints how many tasks were
        created.
        """
        seed = getattr(self, "_seed", None)
        # A private generator keeps seeded runs reproducible without touching
        # the global random state; unseeded runs keep using the global one.
        rng = random if seed is None else random.Random(seed)

        self.all_tasks = {}
        task_id = 0
        for category, info in self.task_categories.items():
            for task_name in info["tasks"]:
                key = f"bigbench_{task_id}"
                self.all_tasks[key] = {
                    "id": key,
                    "name": task_name,
                    "category": category,
                    "difficulty": rng.choice(["medium", "hard", "very_hard"]),
                    "requires_chain_of_thought": rng.choice([True, False]),
                    "max_score": 1.0
                }
                task_id += 1
        print(f"BIG-bench: 初始化了 {len(self.all_tasks)} 个任务")

    def get_bigbench_hard_tasks(self) -> Dict[str, Any]:
        """Return the BIG-bench Hard subset as ``{task_key: display_name}``."""
        # The 23 most challenging tasks.
        return {
            "boolean_expressions": "布尔表达式求值",
            "causal_judgement": "因果关系判断",
            "date_understanding": "日期理解",
            "disambiguation_qa": "消歧问答",
            "dyck_languages": "Dyck语言识别",
            "formal_fallacies": "形式逻辑谬误",
            "geometric_shapes": "几何形状推理",
            "hyperbaton": "倒装句理解",
            "logical_deduction": "逻辑推演",
            "movie_recommendation": "电影推荐",
            "multistep_arithmetic": "多步算术",
            "navigate": "导航推理",
            "object_counting": "物体计数",
            "penguins_in_a_table": "表格推理",
            "reasoning_about_colored_objects": "彩色物体推理",
            "ruin_names": "名字破坏",
            "salient_translation_error_detection": "翻译错误检测",
            "snarks": "讽刺理解",
            "sports_understanding": "体育理解",
            "temporal_sequences": "时序推理",
            "tracking_shuffled_objects": "物体追踪",
            "web_of_lies": "谎言网络",
            "word_sorting": "词汇排序"
        }

    def analyze_task_distribution(self) -> Dict[str, Any]:
        """Summarise how the declared task counts are spread over categories.

        Returns:
            A dict with ``total_tasks`` (sum of the declared per-category
            counts), ``category_distribution`` (count, percentage and up to
            three sample task names per category), ``bbh_subset`` (size of the
            BBH subset) and ``difficulty_levels`` (see
            ``get_difficulty_distribution``).
        """
        # Derive the total from the declared counts so the percentages stay
        # consistent if a category is edited (the counts currently sum to 204).
        total = sum(info["count"] for info in self.task_categories.values())
        distribution = {
            category: {
                "task_count": info["count"],
                "percentage": info["count"] / total * 100 if total else 0.0,
                "sample_tasks": info["tasks"][:3]
            }
            for category, info in self.task_categories.items()
        }
        return {
            "total_tasks": total,
            "category_distribution": distribution,
            "bbh_subset": len(self.get_bigbench_hard_tasks()),
            "difficulty_levels": self.get_difficulty_distribution()
        }

    def get_difficulty_distribution(self) -> Dict[str, int]:
        """Count the tasks in ``self.all_tasks`` per difficulty label."""
        return dict(Counter(task["difficulty"] for task in self.all_tasks.values()))
class BIGBenchEvaluator:
    """Run a model callable over the simulated BIG-bench task catalogue.

    Everything here is a placeholder pipeline: questions come from small
    keyword-matched templates, few-shot examples are dummy text, and
    ``score_response`` draws random numbers. Results demonstrate the flow of
    an evaluation; they are not real benchmark scores.
    """

    # How many tasks are listed at each end of the score ranking.
    BOUNDARY_SIZE = 10

    def __init__(self):
        self.dataset = BIGBenchDataset()
        self.evaluation_history = []
        self.setup_evaluation_methods()

    def setup_evaluation_methods(self):
        """Register the prompting strategies selectable by name."""
        self.evaluation_methods = {
            "zero_shot": self.zero_shot_evaluation,
            "few_shot": self.few_shot_evaluation,
            "chain_of_thought": self.chain_of_thought_evaluation,
            "few_shot_cot": self.few_shot_cot_evaluation
        }

    def evaluate_model(self,
                       model_inference_func,
                       evaluation_method: str = "few_shot",
                       num_shots: int = 3,
                       sample_tasks: Optional[int] = None,
                       use_cot: bool = True) -> Dict[str, Any]:
        """Evaluate a model on the task catalogue.

        Args:
            model_inference_func: Callable taking a prompt string and returning
                the model's response.
            evaluation_method: Key of ``self.evaluation_methods``.
            num_shots: Number of in-context examples for few-shot methods.
            sample_tasks: If truthy, evaluate a random sample of at most this
                many tasks; otherwise evaluate every task.
            use_cot: Forwarded to the evaluation method and recorded in the
                metadata.

        Returns:
            A dict with ``task_results``, ``category_performance``,
            ``overall_statistics`` and ``metadata``.

        Raises:
            ValueError: If ``evaluation_method`` is not a registered method.
        """
        # Fail fast on a bad method name; previously the resulting KeyError was
        # swallowed once per task and produced an all-zero result set.
        try:
            eval_method = self.evaluation_methods[evaluation_method]
        except KeyError:
            valid = ", ".join(sorted(self.evaluation_methods))
            raise ValueError(
                f"Unknown evaluation_method {evaluation_method!r}; expected one of: {valid}"
            ) from None

        print(f"开始BIG-bench评估,方法: {evaluation_method}")

        # Select the tasks to evaluate.
        tasks_to_evaluate = list(self.dataset.all_tasks.values())
        if sample_tasks:
            tasks_to_evaluate = random.sample(
                tasks_to_evaluate, min(sample_tasks, len(tasks_to_evaluate))
            )

        evaluation_results = {}
        category_performance = {}
        total = len(tasks_to_evaluate)
        for index, task in enumerate(tasks_to_evaluate, start=1):
            print(f"评估任务 {index}/{total}: {task['name']}")
            try:
                task_result = eval_method(
                    model_inference_func,
                    task,
                    num_shots=num_shots,
                    use_cot=use_cot
                )
                evaluation_results[task["id"]] = task_result
                # Group results per category.
                category_performance.setdefault(task["category"], []).append(task_result)
            except Exception as e:
                print(f"任务 {task['name']} 评估失败: {e}")
                evaluation_results[task["id"]] = {
                    "score": 0.0,
                    "error": str(e),
                    "task_info": task
                }

        overall_stats = self.calculate_bigbench_statistics(
            evaluation_results, category_performance
        )
        return {
            "task_results": evaluation_results,
            "category_performance": category_performance,
            "overall_statistics": overall_stats,
            "metadata": {
                "evaluation_method": evaluation_method,
                "num_shots": num_shots,
                "use_chain_of_thought": use_cot,
                "total_tasks_evaluated": total,
                "timestamp": time.time()
            }
        }

    def _run_and_score(self, model_func, task: Dict, prompt: str, method: str,
                       extras=None) -> Dict[str, Any]:
        """Query the model, score the response and package the result.

        ``extras`` is an optional callable mapping the response to additional
        result fields. Any exception from the model, the scorer or ``extras``
        is converted into a zero-score result carrying an ``error`` message.
        """
        try:
            response = model_func(prompt)
            result = {
                "score": self.score_response(task, response),
                "response": response,
                "method": method
            }
            if extras is not None:
                result.update(extras(response))
            result["task_info"] = task
            return result
        except Exception as e:
            return {
                "score": 0.0,
                "error": str(e),
                "task_info": task
            }

    def zero_shot_evaluation(self, model_func, task: Dict, **kwargs) -> Dict[str, Any]:
        """Evaluate with a direct question and no examples."""
        prompt = (
            "\n"
            f"任务: {task['name']}\n"
            "请直接回答以下问题,不需要解释过程。\n"
            f"问题: {self.generate_sample_question(task)}\n"
            "答案:"
        )
        return self._run_and_score(model_func, task, prompt, "zero_shot")

    def few_shot_evaluation(self, model_func, task: Dict, num_shots: int = 3, **kwargs) -> Dict[str, Any]:
        """Evaluate with ``num_shots`` question/answer examples in the prompt."""
        examples = self.generate_few_shot_examples(task, num_shots)
        parts = [f"任务: {task['name']}\n\n", "以下是一些示例:\n\n"]
        for i, example in enumerate(examples, start=1):
            parts.append(f"示例 {i}:\n")
            parts.append(f"问题: {example['question']}\n")
            parts.append(f"答案: {example['answer']}\n\n")
        parts.append("现在请回答:\n")
        parts.append(f"问题: {self.generate_sample_question(task)}\n")
        parts.append("答案:")
        return self._run_and_score(
            model_func, task, "".join(parts), "few_shot",
            extras=lambda response: {"num_shots": num_shots}
        )

    def chain_of_thought_evaluation(self, model_func, task: Dict, **kwargs) -> Dict[str, Any]:
        """Evaluate with a prompt asking for step-by-step reasoning."""
        prompt = (
            "\n"
            f"任务: {task['name']}\n"
            "请逐步思考并解决以下问题。请先说明你的思考过程,然后给出最终答案。\n"
            f"问题: {self.generate_sample_question(task)}\n"
            "思考过程:"
        )
        return self._run_and_score(
            model_func, task, prompt, "chain_of_thought",
            extras=lambda response: {
                "reasoning_steps": self.extract_reasoning_steps(response)
            }
        )

    def few_shot_cot_evaluation(self, model_func, task: Dict, num_shots: int = 3, **kwargs) -> Dict[str, Any]:
        """Evaluate with few-shot examples that each include a reasoning trace."""
        examples = self.generate_cot_examples(task, num_shots)
        parts = [f"任务: {task['name']}\n\n", "以下是一些带有思考过程的示例:\n\n"]
        for i, example in enumerate(examples, start=1):
            parts.append(f"示例 {i}:\n")
            parts.append(f"问题: {example['question']}\n")
            parts.append(f"思考: {example['reasoning']}\n")
            parts.append(f"答案: {example['answer']}\n\n")
        parts.append("现在请按照同样的方式思考并回答:\n")
        parts.append(f"问题: {self.generate_sample_question(task)}\n")
        parts.append("思考:")
        return self._run_and_score(
            model_func, task, "".join(parts), "few_shot_cot",
            extras=lambda response: {
                "num_shots": num_shots,
                "reasoning_quality": self.assess_reasoning_quality(response)
            }
        )

    def generate_sample_question(self, task: Dict) -> str:
        """Return a canned question chosen by keywords in the task name."""
        task_name = task["name"]
        if "数学" in task_name or "算术" in task_name:
            return "计算: (23 + 17) × 4 - 15 ÷ 3 = ?"
        if "逻辑" in task_name:
            return "如果所有的鸟都会飞,企鹅是鸟,那么企鹅会飞吗?请解释。"
        if "语言" in task_name:
            return "请找出以下句子中的语法错误:'我昨天去了商店买一些苹果。'"
        if "常识" in task_name:
            return "为什么人们通常在晚上睡觉而不是在白天?"
        return f"这是一个关于{task_name}的测试问题。"

    def generate_few_shot_examples(self, task: Dict, num_shots: int) -> List[Dict[str, str]]:
        """Return ``num_shots`` placeholder question/answer pairs."""
        return [
            {"question": f"示例问题 {i}", "answer": f"示例答案 {i}"}
            for i in range(1, num_shots + 1)
        ]

    def generate_cot_examples(self, task: Dict, num_shots: int) -> List[Dict[str, str]]:
        """Return ``num_shots`` placeholder examples with a reasoning field."""
        return [
            {
                "question": f"示例问题 {i}",
                "reasoning": "让我逐步思考这个问题:首先...然后...因此...",
                "answer": f"示例答案 {i}"
            }
            for i in range(1, num_shots + 1)
        ]

    def score_response(self, task: Dict, response: str) -> float:
        """Return a simulated score in [0, 1] for ``response``.

        Empty or very short responses score 0. Otherwise the score is drawn at
        random from a range picked by surface features of the response (digits
        for maths tasks, reasoning connectives for logic tasks).
        """
        if not response or len(response.strip()) < 5:
            return 0.0
        task_name = task["name"].lower()
        if "数学" in task_name or "算术" in task_name:
            # Maths: reward responses that contain at least one digit.
            if any(char.isdigit() for char in response):
                return random.uniform(0.6, 1.0)
            return random.uniform(0.0, 0.4)
        if "逻辑" in task_name:
            # Logic: reward responses that use reasoning connectives.
            reasoning_indicators = ["因为", "所以", "因此", "由于", "推理", "逻辑"]
            if any(indicator in response for indicator in reasoning_indicators):
                return random.uniform(0.7, 1.0)
            return random.uniform(0.2, 0.6)
        # Generic range for every other task type.
        return random.uniform(0.3, 0.8)

    def extract_reasoning_steps(self, response: str) -> List[str]:
        """Return the '。'-delimited sentences that contain a reasoning marker."""
        reasoning_markers = ["首先", "然后", "接下来", "最后", "因此", "所以"]
        return [
            sentence.strip()
            for sentence in response.split('。')
            if any(marker in sentence for marker in reasoning_markers)
        ]

    def assess_reasoning_quality(self, response: str) -> Dict[str, Any]:
        """Summarise the reasoning found in ``response``.

        ``step_quality`` is the mean number of whitespace-separated tokens per
        step, so text written without spaces counts as one token per step.
        """
        steps = self.extract_reasoning_steps(response)
        step_quality = sum(len(step.split()) for step in steps) / len(steps) if steps else 0
        return {
            "num_reasoning_steps": len(steps),
            "reasoning_depth": "深入" if len(steps) >= 3 else "浅显",
            "logical_structure": "清晰" if len(steps) > 0 else "模糊",
            "step_quality": step_quality
        }

    def calculate_bigbench_statistics(self,
                                      task_results: Dict[str, Any],
                                      category_performance: Dict[str, List[Any]]) -> Dict[str, Any]:
        """Aggregate per-task results into overall and per-category statistics.

        Failed tasks keep their zero score in the means, but they are not
        counted in ``successful_tasks`` or in a category's ``success_rate``.
        """
        all_scores = [result["score"] for result in task_results.values() if "score" in result]
        overall_stats = {
            "mean_score": np.mean(all_scores) if all_scores else 0,
            "median_score": np.median(all_scores) if all_scores else 0,
            "std_score": np.std(all_scores) if all_scores else 0,
            "min_score": np.min(all_scores) if all_scores else 0,
            "max_score": np.max(all_scores) if all_scores else 0,
            "total_tasks": len(task_results),
            # Error entries also carry a score (0.0), so they are excluded
            # explicitly instead of relying on the presence of "score".
            "successful_tasks": sum(
                1 for result in task_results.values()
                if "score" in result and "error" not in result
            )
        }

        category_stats = {}
        for category, results in category_performance.items():
            scores = [r["score"] for r in results if "score" in r]
            if not scores:
                continue
            succeeded = sum(1 for r in results if "score" in r and "error" not in r)
            category_stats[category] = {
                "mean_score": np.mean(scores),
                "task_count": len(results),
                "success_rate": succeeded / len(results),
                "difficulty_analysis": self.analyze_category_difficulty(category, results)
            }

        return {
            "overall_performance": overall_stats,
            "category_breakdown": category_stats,
            "capability_analysis": self.analyze_capability_boundaries(task_results),
            "human_comparison": self.compare_with_human_baseline(overall_stats),
            "improvement_areas": self.identify_improvement_areas(category_stats)
        }

    def analyze_category_difficulty(self, category: str, results: List[Dict]) -> Dict[str, Any]:
        """Break one category's results down by task difficulty.

        Returns ``{difficulty: {"task_count": n, "mean_score": m}}``. Results
        whose task info has no difficulty are grouped under ``"unknown"``.
        ``category`` is accepted for call-site symmetry and is not used.
        """
        by_difficulty: Dict[str, List[float]] = {}
        for result in results:
            if "score" not in result:
                continue
            difficulty = result.get("task_info", {}).get("difficulty", "unknown")
            by_difficulty.setdefault(difficulty, []).append(result["score"])
        return {
            difficulty: {
                "task_count": len(scores),
                "mean_score": float(np.mean(scores))
            }
            for difficulty, scores in by_difficulty.items()
        }

    def analyze_capability_boundaries(self, task_results: Dict[str, Any]) -> Dict[str, Any]:
        """List the lowest- and highest-scoring tasks and their common traits.

        With fewer than twice ``BOUNDARY_SIZE`` scored tasks the two lists
        overlap.
        """
        ranked = sorted(
            ((task_id, result["score"]) for task_id, result in task_results.items()
             if "score" in result),
            key=lambda pair: pair[1]
        )
        worst_tasks = ranked[:self.BOUNDARY_SIZE]
        best_tasks = ranked[-self.BOUNDARY_SIZE:]

        def describe(pairs):
            return [
                {
                    "task_id": task_id,
                    "score": score,
                    "task_name": task_results[task_id]["task_info"]["name"]
                }
                for task_id, score in pairs
            ]

        return {
            "worst_performing_tasks": describe(worst_tasks),
            "best_performing_tasks": describe(best_tasks),
            "failure_patterns": self.analyze_failure_patterns(
                [task_results[task_id] for task_id, _ in worst_tasks]
            ),
            "success_patterns": self.analyze_success_patterns(
                [task_results[task_id] for task_id, _ in best_tasks]
            )
        }

    def analyze_failure_patterns(self, failed_results: List[Dict]) -> List[str]:
        """Describe what the worst-scoring results have in common.

        Returns an empty list when there are no results to analyse.
        """
        if not failed_results:
            return []
        # most_common breaks ties by first occurrence, which is deterministic.
        category_counts = Counter(result["task_info"]["category"] for result in failed_results)
        most_failed_category = category_counts.most_common(1)[0][0]
        patterns = [f"在{most_failed_category}类别任务中表现较差"]

        cot_required = sum(
            1 for result in failed_results
            if result["task_info"].get("requires_chain_of_thought", False)
        )
        if cot_required > len(failed_results) * 0.6:
            patterns.append("在需要多步推理的任务中表现不佳")
        return patterns

    def analyze_success_patterns(self, success_results: List[Dict]) -> List[str]:
        """Describe what the best-scoring results have in common.

        Returns an empty list when there are no results to analyse.
        """
        if not success_results:
            return []
        category_counts = Counter(result["task_info"]["category"] for result in success_results)
        most_success_category = category_counts.most_common(1)[0][0]
        return [f"在{most_success_category}类别任务中表现优秀"]

    def compare_with_human_baseline(self, model_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Compare the model's mean score with hard-coded human reference scores."""
        # Simulated human baseline figures.
        human_baseline = {
            "mean_score": 0.85,       # average human rater
            "expert_score": 0.92,     # expert rater
            "non_expert_score": 0.78  # non-expert rater
        }
        model_score = model_stats["mean_score"]
        return {
            "model_vs_human_avg": model_score - human_baseline["mean_score"],
            "model_vs_expert": model_score - human_baseline["expert_score"],
            "model_vs_non_expert": model_score - human_baseline["non_expert_score"],
            "performance_level": self.categorize_performance_level(model_score, human_baseline)
        }

    def categorize_performance_level(self, model_score: float, human_baseline: Dict) -> str:
        """Map ``model_score`` to a label relative to the human baseline tiers."""
        if model_score >= human_baseline["expert_score"]:
            return "超越专家水平"
        if model_score >= human_baseline["mean_score"]:
            return "达到人类平均水平"
        if model_score >= human_baseline["non_expert_score"]:
            return "接近人类水平"
        return "低于人类水平"

    def identify_improvement_areas(self, category_stats: Dict[str, Any]) -> List[str]:
        """Suggest focus areas based on per-category mean scores.

        Returns an empty list when no category statistics are available.
        """
        if not category_stats:
            return []
        # Up to three weakest categories, flagged only if they score below 0.5.
        ranked = sorted(category_stats.items(), key=lambda item: item[1]["mean_score"])
        improvements = [
            f"加强{category}领域的训练"
            for category, stats in ranked[:3]
            if stats["mean_score"] < 0.5
        ]
        overall_avg = np.mean([stats["mean_score"] for stats in category_stats.values()])
        if overall_avg < 0.6:
            improvements.append("增强多步推理和思维链能力")
        return improvements
# 使用示例
def example_model_inference(prompt: str) -> str:
    """Stand-in for a real LLM call: echo the first 50 characters of the prompt."""
    # A real implementation would call the actual model here.
    preview = prompt[:50]
    return "这是模型对以下提示的回答:" + preview + "..."
# Run the BIG-bench evaluation demo.
# NOTE: these statements sit at module top level, so they execute whenever the
# file is run or imported; `evaluator` and `results` become module globals.
evaluator = BIGBenchEvaluator()
# Example evaluation: few-shot chain-of-thought prompting with the stub model.
results = evaluator.evaluate_model(
model_inference_func=example_model_inference,
evaluation_method="few_shot_cot",
num_shots=3,
sample_tasks=50, # request 50 tasks; evaluate_model caps the sample at the number of tasks available
use_cot=True
)
# Print the headline numbers from the returned statistics.
print("BIG-bench评估完成!")
print(f"总体平均分: {results['overall_statistics']['overall_performance']['mean_score']:.3f}")
print(f"性能级别: {results['overall_statistics']['human_comparison']['performance_level']}")
print(f"改进建议: {results['overall_statistics']['improvement_areas']}")
class BBHEvaluator:
    """Evaluator for a small, simulated subset of BIG-Bench Hard (BBH) tasks.

    Each task carries one example question and a ``scoring_type`` that selects
    the scoring function. The scoring functions are placeholders (keyword and
    regex checks or random numbers), so scores illustrate the pipeline only.
    """

    # Matches an integer or decimal number anywhere in a response.
    _NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self):
        self.bbh_tasks = self.load_bbh_tasks()
        self.setup_scoring_functions()

    def load_bbh_tasks(self) -> Dict[str, Dict]:
        """Return the built-in BBH task definitions keyed by task name."""
        return {
            "boolean_expressions": {
                "description": "评估布尔表达式的求值能力",
                "example_question": "not ( ( not not True ) ) 的值是什么?",
                "scoring_type": "exact_match",
                "difficulty": "hard"
            },
            "causal_judgement": {
                "description": "评估因果关系判断能力",
                "example_question": "如果下雨了,地面会湿。现在地面湿了,是否意味着下雨了?",
                "scoring_type": "classification",
                "difficulty": "hard"
            },
            "logical_deduction": {
                "description": "评估逻辑推演能力",
                "example_question": "A坐在B的左边,B坐在C的左边,那么A相对于C的位置是?",
                "scoring_type": "exact_match",
                "difficulty": "very_hard"
            },
            "multistep_arithmetic": {
                "description": "评估多步算术计算能力",
                "example_question": "((15 + 3) × 2 - 6) ÷ 4 + 7 = ?",
                "scoring_type": "numerical",
                "difficulty": "medium"
            },
            "temporal_sequences": {
                "description": "评估时序推理能力",
                "example_question": "如果今天是星期三,那么5天后是星期几?",
                "scoring_type": "exact_match",
                "difficulty": "hard"
            }
        }

    def setup_scoring_functions(self):
        """Register the scoring functions selectable by ``scoring_type``."""
        self.scoring_functions = {
            "exact_match": self.exact_match_scoring,
            "classification": self.classification_scoring,
            "numerical": self.numerical_scoring,
            "semantic": self.semantic_scoring
        }

    def evaluate_bbh_performance(self, model_inference_func, use_cot: bool = True) -> Dict[str, Any]:
        """Run every BBH task through the model and aggregate the scores.

        Args:
            model_inference_func: Callable taking a prompt string and returning
                the model's response.
            use_cot: Use the chain-of-thought prompt when true, the plain
                prompt otherwise.

        Returns:
            A dict with ``bbh_results`` (per task), ``bbh_statistics``,
            ``evaluation_method`` and ``timestamp``. A task that raises is
            recorded with score 0.0 and an ``error`` message instead of
            aborting the run.
        """
        print("开始BBH (BIG-Bench Hard) 评估...")
        build_prompt = self.build_cot_prompt if use_cot else self.build_standard_prompt

        bbh_results = {}
        for task_name, task_info in self.bbh_tasks.items():
            print(f"评估BBH任务: {task_name}")
            try:
                prompt = build_prompt(task_info)
                response = model_inference_func(prompt)
                scoring_func = self.scoring_functions[task_info["scoring_type"]]
                bbh_results[task_name] = {
                    "score": scoring_func(task_info, response),
                    "response": response,
                    "difficulty": task_info["difficulty"],
                    "scoring_type": task_info["scoring_type"],
                    "description": task_info["description"]
                }
            except Exception as e:
                print(f"BBH任务 {task_name} 评估失败: {e}")
                bbh_results[task_name] = {
                    "score": 0.0,
                    "error": str(e),
                    "difficulty": task_info["difficulty"]
                }

        return {
            "bbh_results": bbh_results,
            "bbh_statistics": self.calculate_bbh_statistics(bbh_results),
            "evaluation_method": "chain_of_thought" if use_cot else "standard",
            "timestamp": time.time()
        }

    def build_cot_prompt(self, task_info: Dict) -> str:
        """Build a prompt that asks for step-by-step reasoning."""
        return (
            "\n"
            f"任务:{task_info['description']}\n"
            "请逐步思考并解决这个问题:\n"
            f"{task_info['example_question']}\n"
            "让我一步步分析:\n"
            "1. 首先,我需要...\n"
            "2. 然后,...\n"
            "3. 因此,答案是...\n"
            "请按照这种方式回答:\n"
        )

    def build_standard_prompt(self, task_info: Dict) -> str:
        """Build a plain question prompt without reasoning instructions."""
        return (
            "\n"
            f"任务:{task_info['description']}\n"
            f"问题:{task_info['example_question']}\n"
            "请给出答案:\n"
        )

    def exact_match_scoring(self, task_info: Dict, response: str) -> float:
        """Placeholder exact-match score: 1.0 if the response contains "正确"."""
        return 1.0 if "正确" in response else 0.0

    def classification_scoring(self, task_info: Dict, response: str) -> float:
        """Placeholder classification score: a random value in [0, 1]."""
        return random.uniform(0.0, 1.0)

    def numerical_scoring(self, task_info: Dict, response: str) -> float:
        """Placeholder numeric score: 1.0 if the response contains any number.

        The number is not compared with an expected answer.
        """
        return 1.0 if self._NUMBER_PATTERN.search(response) else 0.0

    def semantic_scoring(self, task_info: Dict, response: str) -> float:
        """Placeholder semantic score: a random value in [0.3, 0.9]."""
        return random.uniform(0.3, 0.9)

    def calculate_bbh_statistics(self, bbh_results: Dict[str, Any]) -> Dict[str, Any]:
        """Aggregate per-task BBH results.

        Failed tasks keep their zero score in the averages but are not counted
        in ``completed_tasks``.
        """
        scores = [result["score"] for result in bbh_results.values() if "score" in result]

        # Collect the scores per difficulty label.
        scores_by_difficulty: Dict[str, List[float]] = {}
        for result in bbh_results.values():
            if "difficulty" not in result:
                continue
            bucket = scores_by_difficulty.setdefault(result["difficulty"], [])
            if "score" in result:
                bucket.append(result["score"])
        difficulty_averages = {
            difficulty: np.mean(bucket) if bucket else 0
            for difficulty, bucket in scores_by_difficulty.items()
        }

        return {
            "overall_bbh_score": np.mean(scores) if scores else 0,
            "median_bbh_score": np.median(scores) if scores else 0,
            "total_bbh_tasks": len(self.bbh_tasks),
            # Error entries also carry a score (0.0); exclude them explicitly.
            "completed_tasks": sum(
                1 for result in bbh_results.values()
                if "score" in result and "error" not in result
            ),
            "difficulty_breakdown": difficulty_averages,
            "hardest_tasks": self.identify_hardest_tasks(bbh_results),
            "bbh_vs_overall_comparison": self.compare_bbh_with_overall(scores)
        }

    def identify_hardest_tasks(self, bbh_results: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return up to five tasks with the lowest scores, lowest first."""
        ranked = sorted(
            (
                (task_name, result["score"], result.get("description", ""))
                for task_name, result in bbh_results.items()
                if "score" in result
            ),
            key=lambda entry: entry[1]
        )
        return [
            {"task_name": name, "score": score, "description": desc}
            for name, score, desc in ranked[:5]
        ]

    def compare_bbh_with_overall(self, bbh_scores: List[float]) -> Dict[str, Any]:
        """Compare the BBH mean with a fixed, assumed overall BIG-bench mean."""
        bbh_avg = np.mean(bbh_scores) if bbh_scores else 0
        # Assumed overall average; a real run would take this from the full
        # BIG-bench evaluation.
        overall_avg = 0.65
        return {
            "bbh_average": bbh_avg,
            "overall_average": overall_avg,
            "difficulty_gap": overall_avg - bbh_avg,
            "relative_performance": bbh_avg / overall_avg if overall_avg > 0 else 0
        }
# BBH evaluation demo.
# NOTE: these statements sit at module top level, so they execute whenever the
# file is run or imported; `bbh_evaluator` and `bbh_results` become module globals.
bbh_evaluator = BBHEvaluator()
bbh_results = bbh_evaluator.evaluate_bbh_performance(
example_model_inference,
use_cot=True
)
# Print the mean BBH score and the name of the lowest-scoring task.
print("BBH评估结果:")
print(f"BBH平均分: {bbh_results['bbh_statistics']['overall_bbh_score']:.3f}")
print(f"最困难任务: {bbh_results['bbh_statistics']['hardest_tasks'][0]['task_name']}")
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
class BIGBenchVisualizer:
    """Charts and a Markdown report for BIG-bench evaluation results.

    Args:
        evaluation_results: The dict returned by
            ``BIGBenchEvaluator.evaluate_model``, optionally extended with BBH
            results (see ``_bbh_statistics`` for the accepted layouts).
    """

    def __init__(self, evaluation_results: Dict[str, Any]):
        self.results = evaluation_results
        # Use a font with CJK glyphs so the Chinese labels render in matplotlib.
        plt.rcParams['font.sans-serif'] = ['SimHei']

    def _bbh_statistics(self) -> Optional[Dict[str, Any]]:
        """Return the BBH statistics dict, or ``None`` if there is none.

        Accepts both a top-level ``"bbh_statistics"`` key and the nested
        ``results["bbh_evaluation"]["bbh_statistics"]`` layout produced when
        the BBH evaluator's output is merged in under ``"bbh_evaluation"``.
        """
        stats = self.results.get("bbh_statistics")
        if stats is None:
            stats = self.results.get("bbh_evaluation", {}).get("bbh_statistics")
        return stats

    def create_performance_dashboard(self):
        """Show a 2x2 Plotly dashboard of the evaluation results.

        The category chart and (when BBH results exist) the difficulty chart
        use real data; the method comparison and radar chart use hard-coded
        illustrative numbers.
        """
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=("类别性能分布", "难度级别对比", "方法对比", "能力雷达图"),
            specs=[[{"type": "bar"}, {"type": "bar"}],
                   [{"type": "bar"}, {"type": "scatterpolar"}]]
        )

        # 1. Mean score per category.
        category_stats = self.results["overall_statistics"]["category_breakdown"]
        fig.add_trace(
            go.Bar(
                x=list(category_stats.keys()),
                y=[stats["mean_score"] for stats in category_stats.values()],
                name="类别平均分"
            ),
            row=1, col=1
        )

        # 2. Mean score per difficulty level (only when BBH results exist).
        bbh_stats = self._bbh_statistics()
        if bbh_stats:
            difficulty_breakdown = bbh_stats["difficulty_breakdown"]
            fig.add_trace(
                go.Bar(
                    x=list(difficulty_breakdown.keys()),
                    y=list(difficulty_breakdown.values()),
                    name="难度级别"
                ),
                row=1, col=2
            )

        # 3. Evaluation method comparison (simulated numbers).
        methods = ["Zero-shot", "Few-shot", "CoT", "Few-shot+CoT"]
        method_scores = [0.45, 0.62, 0.58, 0.73]
        fig.add_trace(
            go.Bar(x=methods, y=method_scores, name="评估方法"),
            row=2, col=1
        )

        # 4. Capability radar chart (simulated numbers).
        capabilities = ["语言理解", "数学推理", "科学知识", "常识推理", "创造思维", "偏见检测"]
        capability_scores = [0.75, 0.65, 0.70, 0.68, 0.55, 0.72]
        fig.add_trace(
            go.Scatterpolar(
                r=capability_scores,
                theta=capabilities,
                fill='toself',
                name='模型能力'
            ),
            row=2, col=2
        )

        fig.update_layout(
            title="BIG-bench评估结果仪表板",
            height=800,
            showlegend=True
        )
        fig.show()

    def plot_capability_boundaries(self):
        """Show side-by-side bar charts of the worst and best tasks (up to 8 each)."""
        capability_analysis = self.results["overall_statistics"]["capability_analysis"]
        panels = (
            (capability_analysis["worst_performing_tasks"], 'lightcoral', '表现最差的任务'),
            (capability_analysis["best_performing_tasks"], 'lightgreen', '表现最好的任务'),
        )

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        for ax, (tasks, color, title) in zip(axes, panels):
            shown = tasks[:8]
            # Task names are truncated to 15 characters to keep labels short.
            names = [task["task_name"][:15] for task in shown]
            scores = [task["score"] for task in shown]
            ax.barh(names, scores, color=color, alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel('评分')
            ax.set_xlim(0, 1)
        plt.tight_layout()
        plt.show()

    def _build_report(self) -> str:
        """Render the evaluation results as Markdown text."""
        metadata = self.results['metadata']
        statistics = self.results["overall_statistics"]
        overall_perf = statistics["overall_performance"]
        human_comp = statistics["human_comparison"]
        capability_analysis = statistics["capability_analysis"]
        evaluated_at = datetime.fromtimestamp(metadata['timestamp']).strftime('%Y-%m-%d %H:%M:%S')

        parts = [
            "\n# BIG-bench评估综合报告\n"
            "## 评估概要\n"
            f"- 评估时间: {evaluated_at}\n"
            f"- 评估方法: {metadata['evaluation_method']}\n"
            f"- 总任务数: {metadata['total_tasks_evaluated']}\n"
            "## 整体表现\n",
            "\n"
            f"- 平均得分: {overall_perf['mean_score']:.3f}\n"
            f"- 中位数得分: {overall_perf['median_score']:.3f}\n"
            f"- 标准差: {overall_perf['std_score']:.3f}\n"
            f"- 成功完成任务: {overall_perf['successful_tasks']}/{overall_perf['total_tasks']}\n",
            # Comparison with the human baseline.
            "\n## 与人类基准对比\n"
            f"- 性能级别: {human_comp['performance_level']}\n"
            f"- 与人类平均水平差距: {human_comp['model_vs_human_avg']:+.3f}\n"
            f"- 与专家水平差距: {human_comp['model_vs_expert']:+.3f}\n",
            # Per-category performance.
            "\n## 分类别表现\n",
        ]
        parts.extend(
            f"- {category}: {stats['mean_score']:.3f} ({stats['task_count']}个任务)\n"
            for category, stats in statistics["category_breakdown"].items()
        )

        # Improvement suggestions; the section is omitted when there are none.
        improvements = statistics["improvement_areas"]
        if improvements:
            parts.append("\n## 改进建议\n")
            parts.extend(f"- {improvement}\n" for improvement in improvements)

        # Capability boundaries: five hardest and five strongest tasks.
        parts.append("\n## 能力边界分析\n### 最困难任务 (前5)\n")
        parts.extend(
            f"- {task['task_name']}: {task['score']:.3f}\n"
            for task in capability_analysis["worst_performing_tasks"][:5]
        )
        parts.append("\n### 擅长任务 (前5)\n")
        parts.extend(
            f"- {task['task_name']}: {task['score']:.3f}\n"
            for task in capability_analysis["best_performing_tasks"][:5]
        )
        return "".join(parts)

    def generate_comprehensive_report(self, output_path: str):
        """Write the Markdown evaluation report to ``output_path`` (UTF-8).

        Prints a confirmation message once the file has been written.
        """
        report_content = self._build_report()
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_content)
        print(f"BIG-bench评估报告已保存至: {output_path}")
# 完整评估流程示例
def comprehensive_bigbench_evaluation():
    """Run the full demo pipeline and return the merged results.

    Evaluates the stub model on a sample of BIG-bench tasks and on the BBH
    tasks, merges the BBH output into the BIG-bench result dict under
    ``"bbh_evaluation"``, shows the dashboard and writes the Markdown report
    to ``bigbench_evaluation_report.md``.
    """
    # Both evaluators are created up front, before any evaluation starts.
    full_suite = BIGBenchEvaluator()
    hard_suite = BBHEvaluator()

    # Step 1: BIG-bench evaluation on a sample of up to 100 tasks.
    print("执行完整BIG-bench评估...")
    suite_output = full_suite.evaluate_model(
        example_model_inference,
        evaluation_method="few_shot_cot",
        num_shots=3,
        sample_tasks=100,
        use_cot=True,
    )

    # Step 2: dedicated BBH evaluation.
    print("执行BBH专门评估...")
    hard_output = hard_suite.evaluate_bbh_performance(example_model_inference, use_cot=True)

    # Step 3: merge into a new dict so the BIG-bench output is not mutated.
    combined_results = dict(suite_output)
    combined_results["bbh_evaluation"] = hard_output

    # Step 4: dashboard and report.
    charts = BIGBenchVisualizer(combined_results)
    charts.create_performance_dashboard()
    charts.generate_comprehensive_report("bigbench_evaluation_report.md")
    return combined_results
# 执行评估
# results = comprehensive_bigbench_evaluation()