1. Interview Question #
Explain in detail how to evaluate the hallucination problem of large language models. Analyze it across multiple dimensions, including evaluation methods, hallucination types, and technical implementation, and use concrete examples to show how to design an effective hallucination detection and evaluation system. Also discuss how to balance evaluation accuracy and efficiency in real-world applications.
2. Reference Answer #
2.1 Overview of the LLM Hallucination Problem #
LLM hallucination refers to content generated by a large language model that contains factual errors, logical contradictions, or information inconsistent with the input. It is one of the core challenges in current LLM applications and directly affects a model's reliability and practical usefulness.
2.2 Core Evaluation Methods #
2.2.1 Real-World Scenario Testing #
Core idea: Apply the model to realistic scenarios and check whether its outputs contain logical or factual errors.
Implementation and example:
```python
class RealWorldScenarioTester:
def __init__(self, model, test_scenarios):
self.model = model
self.test_scenarios = test_scenarios
self.hallucination_detector = HallucinationDetector()
def test_scenario(self, scenario):
"""测试单个场景"""
response = self.model.generate(scenario['prompt'])
# 检测幻觉
hallucination_score = self.hallucination_detector.detect(
prompt=scenario['prompt'],
response=response,
ground_truth=scenario.get('ground_truth')
)
return {
'scenario_id': scenario['id'],
'response': response,
'hallucination_score': hallucination_score,
'is_hallucination': hallucination_score > 0.5
}
def batch_test(self):
"""批量测试多个场景"""
results = []
for scenario in self.test_scenarios:
result = self.test_scenario(scenario)
results.append(result)
# 统计分析
hallucination_rate = sum(1 for r in results if r['is_hallucination']) / len(results)
avg_score = sum(r['hallucination_score'] for r in results) / len(results)
return {
'total_scenarios': len(results),
'hallucination_rate': hallucination_rate,
'average_score': avg_score,
'detailed_results': results
}
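# The HallucinationDetector used in __init__ above is referenced but never defined in this
# article. Below is a minimal illustrative stand-in (an assumption, not the original
# implementation): low character-level overlap with the ground truth is treated as a higher
# hallucination score. Character overlap keeps the toy metric language-agnostic.
class HallucinationDetector:
    def detect(self, prompt, response, ground_truth=None):
        if not ground_truth:
            return 0.5  # no reference available: return a neutral score
        response_chars = set(response)
        truth_chars = set(ground_truth)
        overlap = len(response_chars & truth_chars) / max(len(truth_chars), 1)
        return 1.0 - min(overlap, 1.0)  # low overlap with the reference -> high score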
# 使用示例
test_scenarios = [
{
'id': 'medical_1',
'prompt': '请解释高血压的病因和治疗方法',
'ground_truth': '高血压主要由遗传、年龄、生活方式等因素引起...'
},
{
'id': 'historical_1',
'prompt': '请描述第二次世界大战的主要事件',
'ground_truth': '二战从1939年9月1日开始...'
}
]
tester = RealWorldScenarioTester(model, test_scenarios)
results = tester.batch_test()
```
2.2.2 Human Evaluation #
Core idea: Have multiple experts or ordinary users rate the model's outputs, scoring or annotating the frequency and severity of hallucinations.
Implementation and example:
```python
from datetime import datetime

class HumanEvaluationSystem:
def __init__(self):
self.evaluators = []
self.evaluation_criteria = {
'factual_accuracy': '事实准确性',
'logical_consistency': '逻辑一致性',
'coherence': '连贯性',
'relevance': '相关性'
}
def add_evaluator(self, evaluator_id, expertise_level, domain):
"""添加评估者"""
self.evaluators.append({
'id': evaluator_id,
'expertise_level': expertise_level, # 'expert', 'intermediate', 'novice'
'domain': domain,
'evaluations': []
})
def create_evaluation_task(self, model_response, ground_truth, criteria):
"""创建评估任务"""
task = {
'task_id': f"eval_{len(self.evaluators)}",
'model_response': model_response,
'ground_truth': ground_truth,
'criteria': criteria,
'evaluations': []
}
return task
def submit_evaluation(self, evaluator_id, task_id, scores, comments):
"""提交评估结果"""
evaluation = {
'evaluator_id': evaluator_id,
'task_id': task_id,
'scores': scores, # {criterion: score}
'comments': comments,
'timestamp': datetime.now()
}
# 找到对应的评估者并添加评估
for evaluator in self.evaluators:
if evaluator['id'] == evaluator_id:
evaluator['evaluations'].append(evaluation)
break
def analyze_evaluations(self, task_id):
"""分析评估结果"""
all_evaluations = []
for evaluator in self.evaluators:
for eval in evaluator['evaluations']:
if eval['task_id'] == task_id:
all_evaluations.append(eval)
if not all_evaluations:
return None
# 计算平均分
criteria_scores = {}
for criterion in self.evaluation_criteria:
scores = [eval['scores'].get(criterion, 0) for eval in all_evaluations]
criteria_scores[criterion] = {
'average': sum(scores) / len(scores),
'min': min(scores),
'max': max(scores),
'std': self.calculate_std(scores)
}
# 计算整体幻觉分数
overall_score = sum(criteria_scores[c]['average'] for c in criteria_scores) / len(criteria_scores)
return {
'task_id': task_id,
'overall_score': overall_score,
'criteria_scores': criteria_scores,
'evaluation_count': len(all_evaluations),
'is_hallucination': overall_score < 0.6 # 阈值可调整
}
def calculate_std(self, scores):
"""计算标准差"""
if len(scores) < 2:
return 0
mean = sum(scores) / len(scores)
variance = sum((x - mean) ** 2 for x in scores) / len(scores)
return variance ** 0.5
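# Not part of the original system: a small helper (illustrative assumption) for inter-annotator
# agreement. Human scores are only meaningful if evaluators roughly agree, so this reports the
# mean absolute pairwise score difference per criterion (0.0 = perfect agreement).
def compute_score_agreement(eval_system, task_id):
    evals = [e for ev in eval_system.evaluators for e in ev['evaluations'] if e['task_id'] == task_id]
    agreement = {}
    for criterion in eval_system.evaluation_criteria:
        scores = [e['scores'][criterion] for e in evals if criterion in e['scores']]
        if len(scores) < 2:
            continue  # need at least two evaluators to measure agreement
        diffs = [abs(a - b) for i, a in enumerate(scores) for b in scores[i + 1:]]
        agreement[criterion] = sum(diffs) / len(diffs)
    return agreement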
# 使用示例
eval_system = HumanEvaluationSystem()
eval_system.add_evaluator('expert_1', 'expert', 'medicine')
eval_system.add_evaluator('expert_2', 'expert', 'medicine')
eval_system.add_evaluator('user_1', 'novice', 'general')
# 创建评估任务
task = eval_system.create_evaluation_task(
model_response="高血压可以通过大量饮酒来治疗...", # 明显的幻觉
ground_truth="高血压需要药物治疗和生活方式改变...",
criteria=['factual_accuracy', 'logical_consistency']
)
# 模拟评估结果
eval_system.submit_evaluation('expert_1', task['task_id'],
{'factual_accuracy': 0.1, 'logical_consistency': 0.2},
"严重的事实错误")
eval_system.submit_evaluation('expert_2', task['task_id'],
{'factual_accuracy': 0.0, 'logical_consistency': 0.1},
"完全错误的信息")
analysis = eval_system.analyze_evaluations(task['task_id'])
```
2.2.3 Automated Evaluation #
Core idea: Compare the model's output against pre-prepared reference answers and use algorithmic metrics to quantify how far it deviates.
Implementation and example:
```python
class AutomatedHallucinationEvaluator:
def __init__(self, reference_answers, similarity_model):
self.reference_answers = reference_answers
self.similarity_model = similarity_model
self.metrics = {
'bleu': BLEUScorer(),
'rouge': ROUGEScorer(),
'bert_score': BERTScorer(),
'factual_consistency': FactualConsistencyScorer()
}
def evaluate_response(self, query, model_response, reference_id):
"""评估单个回答"""
reference = self.reference_answers[reference_id]
evaluation_results = {}
# 1. 文本相似度评估
evaluation_results['bleu'] = self.metrics['bleu'].score(
[model_response], [reference['answer']]
)
evaluation_results['rouge'] = self.metrics['rouge'].score(
model_response, reference['answer']
)
evaluation_results['bert_score'] = self.metrics['bert_score'].score(
[model_response], [reference['answer']]
)
# 2. 事实一致性评估
evaluation_results['factual_consistency'] = self.metrics['factual_consistency'].score(
query, model_response, reference['facts']
)
# 3. 逻辑一致性评估
evaluation_results['logical_consistency'] = self.evaluate_logical_consistency(
model_response, reference['logical_structure']
)
# 4. 综合幻觉分数
hallucination_score = self.calculate_hallucination_score(evaluation_results)
return {
'query': query,
'model_response': model_response,
'reference': reference,
'metrics': evaluation_results,
'hallucination_score': hallucination_score,
'is_hallucination': hallucination_score > 0.5
}
def evaluate_logical_consistency(self, response, logical_structure):
"""评估逻辑一致性"""
# 使用NLI模型评估逻辑一致性
nli_model = self.similarity_model
consistency_scores = []
for premise, conclusion in logical_structure:
# 检查前提和结论的逻辑关系
score = nli_model.predict_entailment(premise, conclusion)
consistency_scores.append(score)
return sum(consistency_scores) / len(consistency_scores) if consistency_scores else 0.5
def calculate_hallucination_score(self, metrics):
"""计算综合幻觉分数"""
# 权重配置
weights = {
'bleu': 0.2,
'rouge': 0.2,
'bert_score': 0.3,
'factual_consistency': 0.2,
'logical_consistency': 0.1
}
# 归一化分数并计算加权平均
normalized_scores = {}
for metric, score in metrics.items():
if metric in weights:
# 将相似度分数转换为幻觉分数(1-相似度)
normalized_scores[metric] = 1 - score
hallucination_score = sum(
normalized_scores[metric] * weights[metric]
for metric in weights
)
return hallucination_score
def batch_evaluate(self, test_cases):
"""批量评估"""
results = []
for case in test_cases:
result = self.evaluate_response(
case['query'],
case['model_response'],
case['reference_id']
)
results.append(result)
# 统计分析
hallucination_count = sum(1 for r in results if r['is_hallucination'])
avg_hallucination_score = sum(r['hallucination_score'] for r in results) / len(results)
return {
'total_cases': len(results),
'hallucination_count': hallucination_count,
'hallucination_rate': hallucination_count / len(results),
'average_hallucination_score': avg_hallucination_score,
'detailed_results': results
}
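# BLEUScorer, ROUGEScorer, BERTScorer and FactualConsistencyScorer above are assumed to exist.
# As one illustrative sketch (not the article's implementation), a factual-consistency scorer
# can check whether the response entails each reference fact, given any NLI component that
# exposes predict_entailment(premise, hypothesis) -> probability, the interface used elsewhere
# in this section. Unlike the no-argument constructor above, this sketch takes the NLI model
# explicitly.
class SimpleFactualConsistencyScorer:
    def __init__(self, nli_model):
        self.nli_model = nli_model

    def score(self, query, response, reference_facts):
        if not reference_facts:
            return 1.0  # nothing to verify against
        entailment_scores = [
            self.nli_model.predict_entailment(response, fact)  # the response should entail each fact
            for fact in reference_facts
        ]
        return sum(entailment_scores) / len(entailment_scores)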
# 使用示例
reference_answers = {
'medical_1': {
'answer': '高血压的病因包括遗传因素、年龄增长、不良生活方式等...',
'facts': ['高血压有遗传倾向', '年龄是重要风险因素', '需要药物治疗'],
'logical_structure': [
('高血压是慢性疾病', '需要长期管理'),
('遗传因素影响', '家族史是风险因素')
]
}
}
evaluator = AutomatedHallucinationEvaluator(reference_answers, similarity_model)
```
2.2.4 Self-Detection with Data #
Core idea: Design commonsense, well-defined, unambiguous questions to probe the model's understanding and answering ability, and observe its hallucination rate.
Implementation and example:
```python
class SelfDetectionTester:
def __init__(self, model):
self.model = model
self.test_datasets = {
'commonsense': CommonsenseQADataset(),
'factual': FactualQADataset(),
'logical': LogicalReasoningDataset(),
'mathematical': MathematicalReasoningDataset()
}
def test_commonsense_qa(self):
"""测试常识问答"""
dataset = self.test_datasets['commonsense']
results = []
for question in dataset.get_questions():
model_answer = self.model.generate(question['text'])
correct_answer = question['correct_answer']
# 检查答案正确性
is_correct = self.check_answer_correctness(
model_answer, correct_answer, question['type']
)
# 检测幻觉模式
hallucination_patterns = self.detect_hallucination_patterns(
question['text'], model_answer
)
results.append({
'question_id': question['id'],
'question': question['text'],
'model_answer': model_answer,
'correct_answer': correct_answer,
'is_correct': is_correct,
'hallucination_patterns': hallucination_patterns
})
return self.analyze_commonsense_results(results)
def test_factual_qa(self):
"""测试事实问答"""
dataset = self.test_datasets['factual']
results = []
for question in dataset.get_questions():
model_answer = self.model.generate(question['text'])
# 事实性检查
factual_accuracy = self.check_factual_accuracy(
model_answer, question['ground_truth']
)
# 检测事实性幻觉
factual_hallucinations = self.detect_factual_hallucinations(
model_answer, question['domain']
)
results.append({
'question_id': question['id'],
'question': question['text'],
'model_answer': model_answer,
'factual_accuracy': factual_accuracy,
'factual_hallucinations': factual_hallucinations
})
return self.analyze_factual_results(results)
def test_logical_reasoning(self):
"""测试逻辑推理"""
dataset = self.test_datasets['logical']
results = []
for problem in dataset.get_problems():
model_solution = self.model.generate(problem['text'])
# 逻辑一致性检查
logical_consistency = self.check_logical_consistency(
problem['premises'], model_solution, problem['conclusion']
)
# 检测逻辑幻觉
logical_hallucinations = self.detect_logical_hallucinations(
problem['text'], model_solution
)
results.append({
'problem_id': problem['id'],
'problem': problem['text'],
'model_solution': model_solution,
'logical_consistency': logical_consistency,
'logical_hallucinations': logical_hallucinations
})
return self.analyze_logical_results(results)
def check_answer_correctness(self, model_answer, correct_answer, question_type):
"""检查答案正确性"""
if question_type == 'multiple_choice':
return model_answer.strip().lower() == correct_answer.strip().lower()
elif question_type == 'open_ended':
# 使用语义相似度检查
similarity = self.calculate_semantic_similarity(model_answer, correct_answer)
return similarity > 0.8
else:
return False
def detect_hallucination_patterns(self, question, answer):
"""检测幻觉模式"""
patterns = []
# 检查是否包含未在问题中提到的信息
if self.contains_unmentioned_information(question, answer):
patterns.append('unmentioned_information')
# 检查是否包含矛盾信息
if self.contains_contradictory_information(answer):
patterns.append('contradictory_information')
# 检查是否包含过度自信的表达
if self.contains_overconfident_language(answer):
patterns.append('overconfident_language')
return patterns
def detect_factual_hallucinations(self, answer, domain):
"""检测事实性幻觉"""
hallucinations = []
# 使用领域特定的知识库检查事实
domain_kb = self.get_domain_knowledge_base(domain)
# 提取答案中的事实声明
factual_claims = self.extract_factual_claims(answer)
for claim in factual_claims:
if not domain_kb.verify_fact(claim):
hallucinations.append({
'claim': claim,
'type': 'factual_error',
'severity': self.assess_severity(claim, domain)
})
return hallucinations
def detect_logical_hallucinations(self, problem, solution):
"""检测逻辑幻觉"""
hallucinations = []
# 检查逻辑推理步骤
reasoning_steps = self.extract_reasoning_steps(solution)
for i, step in enumerate(reasoning_steps):
# 检查步骤的逻辑有效性
if not self.is_logically_valid(step, reasoning_steps[:i]):
hallucinations.append({
'step': step,
'position': i,
'type': 'logical_error',
'description': '逻辑推理错误'
})
return hallucinations
def analyze_commonsense_results(self, results):
"""分析常识问答结果"""
total_questions = len(results)
correct_answers = sum(1 for r in results if r['is_correct'])
hallucination_cases = sum(1 for r in results if r['hallucination_patterns'])
return {
'total_questions': total_questions,
'accuracy': correct_answers / total_questions,
'hallucination_rate': hallucination_cases / total_questions,
'common_patterns': self.analyze_common_patterns(results),
'detailed_results': results
}
def analyze_factual_results(self, results):
"""分析事实问答结果"""
total_questions = len(results)
avg_factual_accuracy = sum(r['factual_accuracy'] for r in results) / total_questions
total_hallucinations = sum(len(r['factual_hallucinations']) for r in results)
return {
'total_questions': total_questions,
'average_factual_accuracy': avg_factual_accuracy,
'total_hallucinations': total_hallucinations,
'hallucination_rate': total_hallucinations / total_questions,
'detailed_results': results
}
def analyze_logical_results(self, results):
"""分析逻辑推理结果"""
total_problems = len(results)
avg_logical_consistency = sum(r['logical_consistency'] for r in results) / total_problems
total_logical_hallucinations = sum(len(r['logical_hallucinations']) for r in results)
return {
'total_problems': total_problems,
'average_logical_consistency': avg_logical_consistency,
'total_logical_hallucinations': total_logical_hallucinations,
'hallucination_rate': total_logical_hallucinations / total_problems,
'detailed_results': results
}
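# calculate_semantic_similarity is called above but never defined. A minimal sketch, assuming
# the third-party sentence-transformers package (an external dependency chosen here for
# illustration, not specified by the article): embed both texts and compare with cosine similarity.
from sentence_transformers import SentenceTransformer, util

_embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # example model choice

def calculate_semantic_similarity(text_a, text_b):
    embeddings = _embedder.encode([text_a, text_b], convert_to_tensor=True)
    return float(util.cos_sim(embeddings[0], embeddings[1]))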
# 使用示例
tester = SelfDetectionTester(model)
commonsense_results = tester.test_commonsense_qa()
factual_results = tester.test_factual_qa()
logical_results = tester.test_logical_reasoning()
```
2.3 Hallucination Types and Advanced Analysis #
2.3.1 Strange Logical Chains #
Characteristics: LLMs sometimes generate reasoning chains with flawed logic; evaluating these hallucinations requires explicit logical review.
Detection method:
```python
class LogicalChainAnalyzer:
def __init__(self, logic_model):
self.logic_model = logic_model
def analyze_logical_chain(self, premises, conclusion, reasoning_steps):
"""分析逻辑链条"""
analysis = {
'premises': premises,
'conclusion': conclusion,
'reasoning_steps': reasoning_steps,
'logical_errors': [],
'consistency_score': 0.0
}
# 检查每个推理步骤
for i, step in enumerate(reasoning_steps):
step_analysis = self.analyze_reasoning_step(
step, premises, reasoning_steps[:i]
)
if step_analysis['has_error']:
analysis['logical_errors'].append({
'step_index': i,
'step': step,
'error_type': step_analysis['error_type'],
'description': step_analysis['description']
})
# 计算整体一致性分数
analysis['consistency_score'] = self.calculate_consistency_score(analysis)
return analysis
def analyze_reasoning_step(self, step, premises, previous_steps):
"""分析单个推理步骤"""
# 检查步骤的逻辑有效性
validity = self.logic_model.check_validity(step, premises, previous_steps)
if not validity['is_valid']:
return {
'has_error': True,
'error_type': validity['error_type'],
'description': validity['description']
}
return {'has_error': False}
def calculate_consistency_score(self, analysis):
"""计算一致性分数"""
total_steps = len(analysis['reasoning_steps'])
error_steps = len(analysis['logical_errors'])
if total_steps == 0:
return 1.0
return 1.0 - (error_steps / total_steps)
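# logic_model is assumed above but never defined. A toy stand-in (purely illustrative) so the
# usage example below can run: a step is flagged as an unsupported leap if it introduces
# characters that appear in neither the premises nor any earlier step. A real system would use
# an NLI model or a symbolic prover here instead.
class KeywordLogicModel:
    CONNECTIVES = set('因此所以故则即,。、 ')

    def check_validity(self, step, premises, previous_steps):
        supporting_text = ''.join(premises) + ''.join(previous_steps)
        unsupported = [ch for ch in step if ch not in supporting_text and ch not in self.CONNECTIVES]
        if unsupported:
            return {'is_valid': False, 'error_type': 'unsupported_step',
                    'description': 'step introduces content not grounded in the premises: ' + ''.join(unsupported)}
        return {'is_valid': True, 'error_type': None, 'description': ''}

logic_model = KeywordLogicModel()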
# 使用示例
logic_analyzer = LogicalChainAnalyzer(logic_model)
# 测试逻辑链条
premises = ["所有鸟都会飞", "企鹅是鸟"]
conclusion = "企鹅会飞"
reasoning_steps = [
"企鹅是鸟",
"所有鸟都会飞",
"因此企鹅会飞"
]
analysis = logic_analyzer.analyze_logical_chain(premises, conclusion, reasoning_steps)
print(f"逻辑一致性分数: {analysis['consistency_score']}")
print(f"逻辑错误数量: {len(analysis['logical_errors'])}")2.3.2 领域特定内容评测(Domain-Specific Content Evaluation) #
特征: 对于专业领域(如医学、法律),需要该领域的专家参与评估大模型的回答。
实现方法:
class DomainSpecificEvaluator:
def __init__(self, domain, expert_knowledge_base):
self.domain = domain
self.expert_kb = expert_knowledge_base
self.domain_rules = self.load_domain_rules(domain)
def evaluate_domain_response(self, query, response, expert_feedback=None):
"""评估领域特定回答"""
evaluation = {
'domain': self.domain,
'query': query,
'response': response,
'domain_accuracy': 0.0,
'compliance_score': 0.0,
'expert_rating': 0.0,
'violations': []
}
# 1. 领域知识准确性检查
evaluation['domain_accuracy'] = self.check_domain_accuracy(response)
# 2. 领域规则合规性检查
evaluation['compliance_score'] = self.check_domain_compliance(response)
# 3. 专家反馈整合
if expert_feedback:
evaluation['expert_rating'] = expert_feedback['rating']
evaluation['expert_comments'] = expert_feedback['comments']
# 4. 违规检测
evaluation['violations'] = self.detect_domain_violations(response)
return evaluation
def check_domain_accuracy(self, response):
"""检查领域知识准确性"""
# 提取响应中的事实声明
factual_claims = self.extract_factual_claims(response)
accuracy_scores = []
for claim in factual_claims:
# 与专家知识库对比
accuracy = self.expert_kb.verify_claim(claim, self.domain)
accuracy_scores.append(accuracy)
return sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0.0
def check_domain_compliance(self, response):
"""检查领域规则合规性"""
compliance_score = 1.0
for rule in self.domain_rules:
if not rule.check_compliance(response):
compliance_score -= rule.penalty
return max(0.0, compliance_score)
def detect_domain_violations(self, response):
"""检测领域违规"""
violations = []
for rule in self.domain_rules:
if not rule.check_compliance(response):
violations.append({
'rule_id': rule.id,
'rule_name': rule.name,
'violation_type': rule.violation_type,
'severity': rule.severity,
'description': rule.get_violation_description(response)
})
return violations
def load_domain_rules(self, domain):
"""加载领域规则"""
if domain == 'medical':
return self.load_medical_rules()
elif domain == 'legal':
return self.load_legal_rules()
elif domain == 'financial':
return self.load_financial_rules()
else:
return []
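# The expert_knowledge_base passed into DomainSpecificEvaluator is assumed. A minimal
# in-memory sketch (illustrative only): verify_claim returns 1.0 when a claim overlaps a
# whitelisted statement closely enough, otherwise 0.0. A real deployment would query a curated
# medical knowledge graph or guideline database instead.
class InMemoryExpertKB:
    def __init__(self, facts_by_domain):
        self.facts_by_domain = facts_by_domain  # {domain: [statements accepted as correct]}

    def verify_claim(self, claim, domain):
        for fact in self.facts_by_domain.get(domain, []):
            overlap = len(set(claim) & set(fact)) / max(len(set(fact)), 1)  # character overlap
            if overlap > 0.6:
                return 1.0
        return 0.0

# Hypothetical instance used by the example further below.
medical_kb = InMemoryExpertKB({'medical': ['高血压需要长期药物治疗和生活方式干预']})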
# 医学领域规则示例
class MedicalRule:
def __init__(self, rule_id, name, check_function, penalty, severity):
self.id = rule_id
self.name = name
self.check_function = check_function
self.penalty = penalty
self.severity = severity
def check_compliance(self, response):
return self.check_function(response)
def get_violation_description(self, response):
return f"违反了{self.name}规则"
# 使用示例
medical_rules = [
MedicalRule(
'med_001',
'不得提供具体诊断建议',
lambda r: '诊断' not in r and '确诊' not in r,
0.3,
'high'
),
MedicalRule(
'med_002',
'必须包含免责声明',
lambda r: '仅供参考' in r or '咨询医生' in r,
0.2,
'medium'
)
]
evaluator = DomainSpecificEvaluator('medical', medical_kb)
```
2.3.3 Confused Concepts / Factual-Basis Issues #
Characteristics: LLMs often combine unrelated pieces of information, so the evaluation should deliberately include knowledge points that are easy to confuse.
Detection method:
```python
class ConceptConfusionDetector:
def __init__(self, concept_graph, confusion_patterns):
self.concept_graph = concept_graph
self.confusion_patterns = confusion_patterns
def detect_concept_confusion(self, response, domain=None):
"""检测概念混淆"""
confusions = []
# 1. 提取响应中的概念
concepts = self.extract_concepts(response)
# 2. 检查概念间的关系
for i, concept1 in enumerate(concepts):
for concept2 in concepts[i+1:]:
confusion = self.check_concept_relationship(concept1, concept2)
if confusion:
confusions.append(confusion)
# 3. 检查已知的混淆模式
pattern_confusions = self.check_confusion_patterns(response)
confusions.extend(pattern_confusions)
return confusions
def extract_concepts(self, response):
"""提取响应中的概念"""
# 使用NER和概念识别模型
entities = self.ner_model.extract_entities(response)
concepts = []
for entity in entities:
if entity['type'] in ['PERSON', 'ORG', 'GPE', 'EVENT']:
concepts.append({
'text': entity['text'],
'type': entity['type'],
'confidence': entity['confidence']
})
return concepts
def check_concept_relationship(self, concept1, concept2):
"""检查概念间关系"""
# 检查概念是否相关
if self.concept_graph.are_related(concept1['text'], concept2['text']):
return None
# 检查是否容易混淆
confusion_score = self.calculate_confusion_score(concept1, concept2)
if confusion_score > 0.7:
return {
'concept1': concept1,
'concept2': concept2,
'confusion_score': confusion_score,
'confusion_type': 'unrelated_concepts',
'description': f"将不相关的概念{concept1['text']}和{concept2['text']}进行了关联"
}
return None
def check_confusion_patterns(self, response):
"""检查已知的混淆模式"""
confusions = []
for pattern in self.confusion_patterns:
if pattern.matches(response):
confusions.append({
'pattern_id': pattern.id,
'pattern_name': pattern.name,
'confusion_type': 'known_pattern',
'description': pattern.description,
'severity': pattern.severity
})
return confusions
def calculate_confusion_score(self, concept1, concept2):
"""计算混淆分数"""
# 基于语义相似度和概念关系计算
semantic_similarity = self.calculate_semantic_similarity(
concept1['text'], concept2['text']
)
# 如果语义相似但概念不相关,则混淆分数高
if semantic_similarity > 0.8 and not self.concept_graph.are_related(
concept1['text'], concept2['text']
):
return semantic_similarity
return 0.0
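# concept_graph is assumed above. A minimal adjacency-set sketch (an assumption, not the
# article's component): two concepts count as "related" only if an edge between them has been
# registered explicitly; everything else is treated as unrelated.
class SimpleConceptGraph:
    def __init__(self):
        self.edges = set()

    def add_relation(self, concept_a, concept_b):
        self.edges.add(frozenset((concept_a, concept_b)))

    def are_related(self, concept_a, concept_b):
        return frozenset((concept_a, concept_b)) in self.edges

concept_graph = SimpleConceptGraph()
concept_graph.add_relation('高血压', '药物治疗')  # example edge; real edges come from a knowledge base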
# 混淆模式示例
class ConfusionPattern:
def __init__(self, pattern_id, name, pattern_regex, description, severity):
self.id = pattern_id
self.name = name
self.pattern_regex = pattern_regex
self.description = description
self.severity = severity
def matches(self, text):
import re
return bool(re.search(self.pattern_regex, text, re.IGNORECASE))
# 使用示例
confusion_patterns = [
ConfusionPattern(
'conf_001',
'时间混淆',
r'(\d{4}年).*?(\d{4}年)',
'将不同年份的事件混淆',
'medium'
),
ConfusionPattern(
'conf_002',
'人物混淆',
r'(张三|李四).*?(王五|赵六)',
'将不同人物混淆',
'high'
)
]
detector = ConceptConfusionDetector(concept_graph, confusion_patterns)
```
2.3.4 Batch Evaluation and Fine-Grained Analysis #
Characteristics: Design large-scale QA evaluations, apply batch testing, and use fine-grained analysis to locate the root causes of problems.
Implementation:
```python
class BatchHallucinationEvaluator:
def __init__(self, model, evaluation_metrics):
self.model = model
self.metrics = evaluation_metrics
self.batch_size = 32
self.parallel_workers = 4
def batch_evaluate(self, test_dataset, evaluation_config):
"""批量评估"""
results = []
# 分批处理
for batch in self.create_batches(test_dataset, self.batch_size):
batch_results = self.evaluate_batch(batch, evaluation_config)
results.extend(batch_results)
# 细粒度分析
analysis = self.fine_grained_analysis(results)
return {
'total_samples': len(results),
'batch_results': results,
'fine_grained_analysis': analysis,
'summary_statistics': self.calculate_summary_statistics(results)
}
def evaluate_batch(self, batch, config):
"""评估单个批次"""
batch_results = []
for sample in batch:
# 生成模型回答
model_response = self.model.generate(
sample['query'],
max_length=config.get('max_length', 512),
temperature=config.get('temperature', 0.7)
)
# 多维度评估
evaluation = self.comprehensive_evaluate(
sample['query'],
model_response,
sample.get('ground_truth'),
sample.get('reference_facts')
)
batch_results.append({
'sample_id': sample['id'],
'query': sample['query'],
'model_response': model_response,
'ground_truth': sample.get('ground_truth'),
'evaluation': evaluation
})
return batch_results
def comprehensive_evaluate(self, query, response, ground_truth, reference_facts):
"""综合评估"""
evaluation = {}
# 1. 事实准确性
if ground_truth:
evaluation['factual_accuracy'] = self.metrics['factual_accuracy'].score(
response, ground_truth
)
# 2. 逻辑一致性
evaluation['logical_consistency'] = self.metrics['logical_consistency'].score(
query, response
)
# 3. 连贯性
evaluation['coherence'] = self.metrics['coherence'].score(response)
# 4. 相关性
evaluation['relevance'] = self.metrics['relevance'].score(query, response)
# 5. 幻觉检测
evaluation['hallucination_score'] = self.metrics['hallucination_detector'].score(
query, response, reference_facts
)
# 6. 领域特定评估
if reference_facts:
evaluation['domain_specific'] = self.metrics['domain_evaluator'].score(
response, reference_facts
)
return evaluation
def fine_grained_analysis(self, results):
"""细粒度分析"""
analysis = {
'hallucination_patterns': self.analyze_hallucination_patterns(results),
'error_distribution': self.analyze_error_distribution(results),
'performance_by_domain': self.analyze_performance_by_domain(results),
'correlation_analysis': self.analyze_correlations(results),
'outlier_analysis': self.analyze_outliers(results)
}
return analysis
def analyze_hallucination_patterns(self, results):
"""分析幻觉模式"""
patterns = {
'factual_errors': [],
'logical_errors': [],
'contradictions': [],
'hallucinated_facts': []
}
for result in results:
evaluation = result['evaluation']
if evaluation['hallucination_score'] > 0.7:
# 分析幻觉类型
if evaluation.get('factual_accuracy', 1.0) < 0.5:
patterns['factual_errors'].append(result)
if evaluation.get('logical_consistency', 1.0) < 0.5:
patterns['logical_errors'].append(result)
# 检测矛盾
if self.detect_contradictions(result['model_response']):
patterns['contradictions'].append(result)
# 检测虚构事实
if self.detect_hallucinated_facts(result['model_response']):
patterns['hallucinated_facts'].append(result)
return patterns
def analyze_error_distribution(self, results):
"""分析错误分布"""
error_distribution = {
'by_severity': {'low': 0, 'medium': 0, 'high': 0},
'by_type': {},
'by_domain': {}
}
for result in results:
evaluation = result['evaluation']
hallucination_score = evaluation['hallucination_score']
# 按严重程度分类
if hallucination_score < 0.3:
error_distribution['by_severity']['low'] += 1
elif hallucination_score < 0.7:
error_distribution['by_severity']['medium'] += 1
else:
error_distribution['by_severity']['high'] += 1
# 按类型分类
error_type = self.classify_error_type(evaluation)
error_distribution['by_type'][error_type] = error_distribution['by_type'].get(error_type, 0) + 1
# 按领域分类
domain = result.get('domain', 'general')
error_distribution['by_domain'][domain] = error_distribution['by_domain'].get(domain, 0) + 1
return error_distribution
def analyze_performance_by_domain(self, results):
"""按领域分析性能"""
domain_performance = {}
for result in results:
domain = result.get('domain', 'general')
if domain not in domain_performance:
domain_performance[domain] = {
'total_samples': 0,
'hallucination_scores': [],
'accuracy_scores': []
}
domain_performance[domain]['total_samples'] += 1
domain_performance[domain]['hallucination_scores'].append(
result['evaluation']['hallucination_score']
)
domain_performance[domain]['accuracy_scores'].append(
result['evaluation'].get('factual_accuracy', 0.0)
)
# 计算每个领域的统计信息
for domain, data in domain_performance.items():
data['avg_hallucination_score'] = sum(data['hallucination_scores']) / len(data['hallucination_scores'])
data['avg_accuracy'] = sum(data['accuracy_scores']) / len(data['accuracy_scores'])
data['hallucination_rate'] = sum(1 for score in data['hallucination_scores'] if score > 0.5) / len(data['hallucination_scores'])
return domain_performance
def analyze_correlations(self, results):
"""分析相关性"""
correlations = {}
# 提取数值特征
features = []
for result in results:
evaluation = result['evaluation']
features.append({
'factual_accuracy': evaluation.get('factual_accuracy', 0.0),
'logical_consistency': evaluation.get('logical_consistency', 0.0),
'coherence': evaluation.get('coherence', 0.0),
'relevance': evaluation.get('relevance', 0.0),
'hallucination_score': evaluation['hallucination_score']
})
# 计算特征间相关性
import numpy as np
from scipy.stats import pearsonr
feature_names = list(features[0].keys())
correlation_matrix = np.zeros((len(feature_names), len(feature_names)))
for i, feature1 in enumerate(feature_names):
for j, feature2 in enumerate(feature_names):
values1 = [f[feature1] for f in features]
values2 = [f[feature2] for f in features]
corr, _ = pearsonr(values1, values2)
correlation_matrix[i][j] = corr
correlations['matrix'] = correlation_matrix.tolist()
correlations['feature_names'] = feature_names
return correlations
def analyze_outliers(self, results):
"""分析异常值"""
outliers = {
'high_hallucination': [],
'low_performance': [],
'unusual_patterns': []
}
# 计算阈值
hallucination_scores = [r['evaluation']['hallucination_score'] for r in results]
avg_hallucination = sum(hallucination_scores) / len(hallucination_scores)
std_hallucination = (sum((x - avg_hallucination) ** 2 for x in hallucination_scores) / len(hallucination_scores)) ** 0.5
for result in results:
evaluation = result['evaluation']
hallucination_score = evaluation['hallucination_score']
# 高幻觉异常值
if hallucination_score > avg_hallucination + 2 * std_hallucination:
outliers['high_hallucination'].append(result)
# 低性能异常值
if evaluation.get('factual_accuracy', 1.0) < 0.3:
outliers['low_performance'].append(result)
# 异常模式检测
if self.detect_unusual_patterns(result):
outliers['unusual_patterns'].append(result)
return outliers
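# create_batches is used in batch_evaluate above but not defined. A straightforward sketch of
# the missing helper, written here as a standalone function (inside the class it would be a
# method taking self as its first argument):
def create_batches(dataset, batch_size):
    """Split the test set into fixed-size chunks for batched evaluation."""
    return [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]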
# 使用示例
evaluator = BatchHallucinationEvaluator(model, evaluation_metrics)
results = evaluator.batch_evaluate(test_dataset, evaluation_config)
```
2.3.5 Enhancement and Transparency Tools #
Characteristics: Use interpretability tools such as LIME and SHAP to explain the model's decision path and understand how it arrives at hallucinated conclusions.
Implementation:
```python
class ModelTransparencyAnalyzer:
def __init__(self, model, explainer_models):
self.model = model
self.explainer_models = explainer_models # LIME, SHAP等
self.attention_analyzer = AttentionAnalyzer()
def analyze_model_decision(self, query, response, ground_truth=None):
"""分析模型决策过程"""
analysis = {
'query': query,
'response': response,
'decision_path': {},
'attention_weights': {},
'feature_importance': {},
'hallucination_explanation': {}
}
# 1. 注意力权重分析
analysis['attention_weights'] = self.analyze_attention_weights(query, response)
# 2. 特征重要性分析
analysis['feature_importance'] = self.analyze_feature_importance(query, response)
# 3. 决策路径分析
analysis['decision_path'] = self.analyze_decision_path(query, response)
# 4. 幻觉解释
if ground_truth:
analysis['hallucination_explanation'] = self.explain_hallucination(
query, response, ground_truth
)
return analysis
def analyze_attention_weights(self, query, response):
"""分析注意力权重"""
# 获取模型的注意力权重
attention_weights = self.model.get_attention_weights(query, response)
analysis = {
'query_attention': attention_weights['query_attention'],
'response_attention': attention_weights['response_attention'],
'cross_attention': attention_weights['cross_attention'],
'attention_patterns': self.identify_attention_patterns(attention_weights)
}
return analysis
def analyze_feature_importance(self, query, response):
"""分析特征重要性"""
feature_importance = {}
# 使用LIME分析
if 'lime' in self.explainer_models:
lime_explainer = self.explainer_models['lime']
lime_explanation = lime_explainer.explain_instance(
query, self.model.predict_proba
)
feature_importance['lime'] = lime_explanation.as_list()
# 使用SHAP分析
if 'shap' in self.explainer_models:
shap_explainer = self.explainer_models['shap']
shap_values = shap_explainer.shap_values(query)
feature_importance['shap'] = shap_values.tolist()
return feature_importance
def analyze_decision_path(self, query, response):
"""分析决策路径"""
decision_path = {
'input_processing': self.analyze_input_processing(query),
'reasoning_steps': self.extract_reasoning_steps(query, response),
'output_generation': self.analyze_output_generation(response),
'confidence_scores': self.calculate_confidence_scores(query, response)
}
return decision_path
def explain_hallucination(self, query, response, ground_truth):
"""解释幻觉产生原因"""
explanation = {
'factual_discrepancies': self.identify_factual_discrepancies(response, ground_truth),
'logical_errors': self.identify_logical_errors(query, response),
'attention_misalignment': self.identify_attention_misalignment(query, response),
'confidence_calibration': self.