摘要
上下文质量评估是确保RAG和LLM应用效果的关键环节。本文系统讲解Faithfulness(忠实度)、Answer Relevance(答案相关性)、Context Precision/Recall、RAGAS评估框架,以及幻觉检测技术(Self-RAG、SARF),提供完整的评估方法和代码实现。
关键词速览
| 术语 | 英文 | 说明 |
|---|---|---|
| 忠实度 | Faithfulness | 生成内容与上下文的匹配程度 |
| 相关性 | Relevance | 答案与问题的相关程度 |
| 精确度 | Precision | 上下文中的相关比例 |
| 召回率 | Recall | 相关内容的召回比例 |
| 幻觉 | Hallucination | 生成不存在的或错误的内容 |
| RAGAS | RAG Assessment | RAG系统评估框架 |
| Self-RAG | Self-RAG | 自我反思RAG |
| 困惑度 | Perplexity | 语言模型的不确定性 |
一、评估指标体系
1.1 RAG核心评估维度
┌─────────────────────────────────────────────────────────────────┐
│ RAG评估维度 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 上下文质量 │
│ ├── Context Precision (精确度) │
│ ├── Context Recall (召回率) │
│ └── Context Entities Recall (实体召回) │
│ │
│ 答案质量 │
│ ├── Faithfulness (忠实度) │
│ ├── Answer Relevance (答案相关性) │
│ └── Answer Correctness (答案正确性) │
│ │
│ 幻觉检测 │
│ ├── Factual Accuracy (事实准确性) │
│ ├── Citation Accuracy (引用准确性) │
│ └── Consistency (一致性) │
│ │
└─────────────────────────────────────────────────────────────────┘
1.2 指标对比
| 指标 | 测量什么 | 理想值 | 评估方式 |
|---|---|---|---|
| Context Precision | 上下文相关比例 | 1.0 | 自动 |
| Context Recall | 相关内容召回 | 1.0 | 人工/LLM |
| Faithfulness | 内容忠实度 | 1.0 | 自动 |
| Answer Relevance | 答案相关性 | 1.0 | 自动 |
| Hallucination Rate | 幻觉率 | 0.0 | 混合 |
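按本文实现的口径,上述指标可概括为以下比值(与后文代码一致):
- Faithfulness = 被上下文支持的声明数 / 从答案中提取的声明总数
- Answer Relevance = 答案与问题各等价表述之间相似度的平均值
- Context Precision = 被判定相关的上下文块数 / 检索到的上下文块总数
- Context Recall = 被上下文覆盖的ground truth关键信息点数 / 关键信息点总数
- Hallucination Rate = 被判定为幻觉的声明数 / 声明总数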
二、Faithfulness评估
2.1 什么是Faithfulness
Faithfulness衡量生成答案中的事实性声明能否被所提供的上下文支持:脱离上下文、无法被支持的声明越多,忠实度越低。基础实现如下:
from typing import List

class FaithfulnessEvaluator:
"""忠实度评估器"""
def __init__(self, llm_client):
self.llm = llm_client
def evaluate(
self,
question: str,
context: str,
answer: str
) -> dict:
"""
评估忠实度
返回:
- score: 0-1的分数
- claims: 从答案中提取的声明
- supported_claims: 被支持的声明
- unsupported_claims: 不被支持的声明
"""
# 1. 提取声明
claims = self._extract_claims(answer)
# 2. 检查每个声明是否被上下文支持
supported = []
unsupported = []
for claim in claims:
if self._is_supported(claim, context):
supported.append(claim)
else:
unsupported.append(claim)
# 3. 计算分数
score = len(supported) / len(claims) if claims else 1.0
return {
'score': score,
'total_claims': len(claims),
'supported_claims': supported,
'unsupported_claims': unsupported,
'faithfulness_level': self._get_level(score)
}
def _extract_claims(self, answer: str) -> List[str]:
"""提取答案中的声明"""
prompt = f"""从以下答案中提取所有可验证的事实声明。
每个声明应该是一个独立的事实陈述。
答案:
{answer}
要求:
1. 只提取明确的事实陈述
2. 每个声明用一行输出
3. 不要提取观点、感受、通用描述
声明列表:"""
result = self.llm.generate(prompt)
claims = [line.strip() for line in result.split('\n') if line.strip()]
return claims
def _is_supported(self, claim: str, context: str) -> bool:
"""检查声明是否被上下文支持"""
prompt = f"""判断以下声明是否可以从提供的上下文中推断或验证。
声明:{claim}
上下文:
{context}
判断标准:
- 如果上下文明确支持或能推断出该声明,返回"支持"
- 如果上下文未提及或与声明矛盾,返回"不支持"
判断结果:"""
result = self.llm.generate(prompt).strip()
return "支持" in result
def _get_level(self, score: float) -> str:
"""获取等级描述"""
if score >= 0.9:
return "Excellent (优秀)"
elif score >= 0.7:
return "Good (良好)"
elif score >= 0.5:
return "Fair (一般)"
else:
return "Poor (较差)"2.2 句子级忠实度
class SentenceLevelFaithfulness:
"""句子级忠实度评估"""
def __init__(self, llm_client):
self.llm = llm_client
def evaluate_sentence_level(
self,
context: str,
answer: str
) -> dict:
"""句子级别评估"""
# 分割答案句子
sentences = self._split_sentences(answer)
results = []
for i, sentence in enumerate(sentences, 1):
verdict = self._evaluate_single_sentence(sentence, context)
results.append({
'sentence': sentence,
'supported': verdict['supported'],
'reason': verdict.get('reason', ''),
'sentence_number': i
})
# 计算整体分数
supported_count = sum(1 for r in results if r['supported'])
overall_score = supported_count / len(results) if results else 1.0
return {
'overall_score': overall_score,
'sentence_results': results,
'unsupported_sentences': [r for r in results if not r['supported']]
}
def _split_sentences(self, text: str) -> List[str]:
"""分割句子"""
import re
sentences = re.split(r'[。!?\n]', text)
return [s.strip() for s in sentences if s.strip()]
def _evaluate_single_sentence(
self,
sentence: str,
context: str
) -> dict:
"""评估单个句子"""
prompt = f"""判断以下句子是否完全由上下文支持。
句子:{sentence}
上下文:
{context}
分析步骤:
1. 检查句子中的每个事实是否在上下文中有依据
2. 检查句子的推断是否合理
3. 检查是否有添加上下文中不存在的信息
判断结果和理由:"""
result = self.llm.generate(prompt)
supported = "支持" in result or "一致" in result
reason = result.split("理由:")[-1].strip() if "理由" in result else ""
return {
'supported': supported,
'reason': reason,
'llm_analysis': result
}
三、Answer Relevance评估
3.1 相关性计算
class AnswerRelevanceEvaluator:
"""答案相关性评估"""
def __init__(self, llm_client, embedding_model=None):
self.llm = llm_client
self.embedding = embedding_model
def evaluate(
self,
question: str,
answer: str
) -> dict:
"""
评估答案相关性
方法:
1. 生成问题的多个等价表述
2. 计算与答案的相似度
3. 综合评估
"""
# 1. 生成等价问题
equivalent_questions = self._generate_equivalent_questions(question, answer)
# 2. 计算相似度
if self.embedding:
similarities = self._calculate_embedding_similarity(
answer,
equivalent_questions
)
else:
similarities = self._calculate_keyword_similarity(
answer,
equivalent_questions
)
# 3. 综合评分
score = sum(similarities) / len(similarities) if similarities else 0.0  # 防止未生成等价问题时除零
return {
'score': score,
'equivalent_questions': equivalent_questions,
'similarities': similarities,
'relevance_level': self._get_level(score)
}
def _generate_equivalent_questions(
self,
question: str,
answer: str
) -> List[str]:
"""生成问题的等价表述"""
prompt = f"""基于以下问题和答案,生成3-5个与原问题语义等价但表达不同的问法。
原问题:{question}
答案:
{answer}
要求:
1. 生成的问题应该能够被同样的答案回答
2. 使用不同的词汇和句式
3. 每个问题一行
等价问题:"""
result = self.llm.generate(prompt)
questions = [q.strip() for q in result.split('\n') if q.strip()]
return questions[:5]
def _calculate_embedding_similarity(
self,
answer: str,
questions: List[str]
) -> List[float]:
"""基于embedding计算相似度"""
answer_emb = self.embedding.encode(answer)
similarities = []
for q in questions:
q_emb = self.embedding.encode(q)
sim = self._cosine_similarity(answer_emb, q_emb)
similarities.append(sim)
return similarities
def _calculate_keyword_similarity(
self,
answer: str,
questions: List[str]
) -> List[float]:
"""基于关键词计算相似度"""
answer_keywords = set(answer.lower().split())
similarities = []
for q in questions:
q_keywords = set(q.lower().split())
if not q_keywords:
similarities.append(0)
continue
overlap = len(answer_keywords & q_keywords)
similarity = overlap / len(q_keywords)
similarities.append(similarity)
return similarities
@staticmethod
def _cosine_similarity(a, b):
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
return dot / (norm_a * norm_b + 1e-8)
@staticmethod
def _get_level(score: float) -> str:
if score >= 0.8:
return "Highly Relevant (高度相关)"
elif score >= 0.6:
return "Relevant (相关)"
elif score >= 0.4:
return "Partially Relevant (部分相关)"
else:
return "Irrelevant (不相关)"四、Context Precision与Recall
4.1 Context Precision
class ContextPrecisionEvaluator:
"""上下文精确度评估"""
def __init__(self, llm_client):
self.llm = llm_client
def evaluate(
self,
question: str,
contexts: List[str],
ground_truth: str = None
) -> dict:
"""
评估上下文精确度
Context Precision = 检索到的上下文块中被判定为相关的比例
"""
if not contexts:
return {'score': 0, 'precision': 0, 'details': []}
# 评估每个上下文块的相关性
relevance_scores = []
for i, ctx in enumerate(contexts):
relevance = self._assess_relevance(question, ctx)
relevance_scores.append({
'context_id': i,
'context_preview': ctx[:100] + '...' if len(ctx) > 100 else ctx,
'relevance': relevance,
'is_relevant': relevance >= 0.5
})
# 计算精确度
relevant_count = sum(1 for r in relevance_scores if r['is_relevant'])
precision = relevant_count / len(contexts) if contexts else 0
return {
'score': precision,
'precision': precision,
'relevant_count': relevant_count,
'total_count': len(contexts),
'details': relevance_scores
}
def _assess_relevance(self, question: str, context: str) -> float:
"""评估单个上下文的相关性"""
prompt = f"""评估以下上下文对于回答问题的相关程度。
问题:{question}
上下文:
{context}
评分标准:
- 1.0: 上下文直接包含回答问题所需的关键信息
- 0.7: 上下文包含大部分相关信息
- 0.5: 上下文包含部分相关信息
- 0.3: 上下文相关度较低
- 0.0: 上下文与问题完全无关
相关度评分(只输出数字0-1):"""
try:
result = self.llm.generate(prompt).strip()
return float(result)
except (ValueError, TypeError):
# LLM输出无法解析为数字时退回中间分值
return 0.5
4.2 Context Recall
class ContextRecallEvaluator:
"""上下文召回率评估"""
def __init__(self, llm_client):
self.llm = llm_client
def evaluate(
self,
contexts: List[str],
ground_truth: str
) -> dict:
"""
评估上下文召回率
Context Recall = ground truth中的关键信息被检索上下文覆盖的比例
"""
if not contexts:
return {'score': 0, 'recall': 0, 'details': []}
# 合并上下文
combined_context = '\n\n'.join(contexts)
# 提取ground truth中的关键信息
gt_key_points = self._extract_key_points(ground_truth)
# 检查每个关键点是否被上下文覆盖
covered_points = []
uncovered_points = []
for point in gt_key_points:
if self._is_covered(point, combined_context):
covered_points.append(point)
else:
uncovered_points.append(point)
recall = len(covered_points) / len(gt_key_points) if gt_key_points else 0
return {
'score': recall,
'recall': recall,
'total_points': len(gt_key_points),
'covered_points': covered_points,
'uncovered_points': uncovered_points,
'coverage_ratio': f"{len(covered_points)}/{len(gt_key_points)}"
}
def _extract_key_points(self, text: str) -> List[str]:
"""提取关键信息点"""
prompt = f"""从以下文本中提取所有关键信息点。
文本:
{text}
要求:
1. 提取具体的事实、数据、定义
2. 每个信息点一行
3. 不要提取通用描述
关键信息点:"""
result = self.llm.generate(prompt)
points = [p.strip() for p in result.split('\n') if p.strip()]
return points
def _is_covered(self, key_point: str, context: str) -> bool:
"""检查关键点是否被上下文覆盖"""
prompt = f"""判断以下关键信息是否可以从提供的上下文中找到或推断。
关键信息:{key_point}
上下文:
{context}
判断标准:
- 如果上下文中明确包含该信息或可以推断,返回"覆盖"
- 否则返回"未覆盖"
判断结果:"""
result = self.llm.generate(prompt).strip()
return "覆盖" in result五、RAGAS评估框架
5.1 RAGAS实现
from dataclasses import dataclass
from typing import Dict, List, Optional
import numpy as np
@dataclass
class RAGASResult:
"""RAGAS评估结果"""
faithfulness: float
answer_relevance: float
context_precision: float
context_recall: Optional[float]
context_entities_recall: float
answer_correctness: float
overall_score: float
class RAGASEvaluator:
"""RAGAS评估框架实现"""
def __init__(
self,
llm_client,
embedding_model=None
):
self.llm = llm_client
self.embedding = embedding_model
self.faithfulness_eval = FaithfulnessEvaluator(llm_client)
self.relevance_eval = AnswerRelevanceEvaluator(llm_client, embedding_model)
self.precision_eval = ContextPrecisionEvaluator(llm_client)
self.recall_eval = ContextRecallEvaluator(llm_client)
def evaluate(
self,
question: str,
answer: str,
contexts: List[str],
ground_truth: str = None
) -> RAGASResult:
"""
完整的RAGAS评估
"""
# 1. Faithfulness
faithfulness_result = self.faithfulness_eval.evaluate(
question, '\n\n'.join(contexts), answer
)
faithfulness = faithfulness_result['score']
# 2. Answer Relevance
relevance_result = self.relevance_eval.evaluate(question, answer)
answer_relevance = relevance_result['score']
# 3. Context Precision
precision_result = self.precision_eval.evaluate(question, contexts)
context_precision = precision_result['precision']
# 4. Context Recall (需要ground truth)
context_recall = None
if ground_truth:
recall_result = self.recall_eval.evaluate(contexts, ground_truth)
context_recall = recall_result['recall']
# 5. Answer Correctness (需要ground truth)
answer_correctness = 0
if ground_truth:
answer_correctness = self._evaluate_answer_correctness(
answer, ground_truth
)
# 6. Context Entities Recall
context_entities_recall = self._evaluate_entity_recall(
contexts, answer
)
# 7. Overall Score
overall = self._calculate_overall(
faithfulness,
answer_relevance,
context_precision,
context_recall,
answer_correctness
)
return RAGASResult(
faithfulness=faithfulness,
answer_relevance=answer_relevance,
context_precision=context_precision,
context_recall=context_recall,
context_entities_recall=context_entities_recall,
answer_correctness=answer_correctness,
overall_score=overall
)
def _evaluate_answer_correctness(
self,
answer: str,
ground_truth: str
) -> float:
"""评估答案正确性"""
prompt = f"""评估生成答案与标准答案的匹配程度。
生成答案:{answer}
标准答案:{ground_truth}
评分标准(0-1):
- 1.0: 完全匹配或等效
- 0.8: 包含大部分正确内容
- 0.6: 包含部分正确内容
- 0.4: 有一些正确内容
- 0.2: 少量正确内容
- 0.0: 完全不匹配
评分:"""
try:
result = self.llm.generate(prompt).strip()
return float(result)
except (ValueError, TypeError):
return 0.5
def _evaluate_entity_recall(
self,
contexts: List[str],
answer: str
) -> float:
"""评估实体召回"""
# 提取上下文中的实体
context_entities = set()
for ctx in contexts:
entities = self._extract_entities(ctx)
context_entities.update(entities)
# 提取答案中的实体
answer_entities = set(self._extract_entities(answer))
if not context_entities or not answer_entities:
return 1.0
# 此处实际衡量:答案中的实体有多大比例能在上下文中找到(实体接地率)
recall = len(answer_entities & context_entities) / len(answer_entities)
return recall
def _extract_entities(self, text: str) -> set:
"""提取实体(简化版)"""
import re
entities = set()
# 人名
entities.update(re.findall(r'[A-Z][a-z]+\s+[A-Z][a-z]+', text))
# 数字
entities.update(re.findall(r'\d+(?:\.\d+)?%?', text))
# 组织名
entities.update(re.findall(r'[^\s,。]{0,10}(?:公司|医院|学校|银行)', text))  # 机构名通常以这些关键词结尾
return entities
def _calculate_overall(
self,
faithfulness: float,
answer_relevance: float,
context_precision: float,
context_recall: Optional[float],
answer_correctness: float
) -> float:
"""计算综合分数"""
# 不同权重
weights = {
'faithfulness': 0.3,
'answer_relevance': 0.2,
'context_precision': 0.15,
'context_recall': 0.2,
'answer_correctness': 0.15
}
score = (
weights['faithfulness'] * faithfulness +
weights['answer_relevance'] * answer_relevance +
weights['context_precision'] * context_precision
)
if context_recall is not None:
score += (
weights['context_recall'] * context_recall +
weights['answer_correctness'] * answer_correctness
)
else:
# 没有ground truth时,重新分配权重
score = score / (1 - weights['context_recall'] - weights['answer_correctness'])
return score
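一个最小的调用示意(llm_client 与 embedding_model 按前文接口自行接入,数据仅为示例):
```python
ragas = RAGASEvaluator(llm_client, embedding_model)
result = ragas.evaluate(
    question="什么是向量数据库?",
    answer="向量数据库是一种专门存储和检索高维向量的数据库,常用于语义搜索。",
    contexts=[
        "向量数据库用于存储嵌入向量,并支持高效的相似度检索。",
        "常见的向量数据库包括 Milvus、Chroma 等。",
    ],
    ground_truth="向量数据库是用于存储嵌入向量并进行相似度检索的数据库。",
)
print(f"faithfulness={result.faithfulness:.2f}, overall={result.overall_score:.2f}")
```
5.2 批量评估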
class BatchRAGASEvaluator:
"""批量RAGAS评估"""
def __init__(self, evaluator: RAGASEvaluator):
self.evaluator = evaluator
def evaluate_dataset(
self,
test_cases: List[Dict],
include_ground_truth: bool = True
) -> dict:
"""
批量评估测试集
test_cases格式:
{
'question': str,
'answer': str,
'contexts': List[str],
'ground_truth': Optional[str]
}
"""
results = []
for i, case in enumerate(test_cases):
try:
result = self.evaluator.evaluate(
question=case['question'],
answer=case['answer'],
contexts=case['contexts'],
ground_truth=case.get('ground_truth') if include_ground_truth else None
)
results.append({
'case_id': i,
'success': True,
'result': result
})
except Exception as e:
results.append({
'case_id': i,
'success': False,
'error': str(e)
})
# 汇总统计
successful_results = [r['result'] for r in results if r['success']]
if successful_results:
stats = {
'faithfulness': np.mean([r.faithfulness for r in successful_results]),
'answer_relevance': np.mean([r.answer_relevance for r in successful_results]),
'context_precision': np.mean([r.context_precision for r in successful_results]),
'overall_score': np.mean([r.overall_score for r in successful_results])
}
# 可选指标
recall_scores = [r.context_recall for r in successful_results if r.context_recall is not None]
if recall_scores:
stats['context_recall'] = np.mean(recall_scores)
else:
stats = {}
return {
'total_cases': len(test_cases),
'successful': len(successful_results),
'failed': len(test_cases) - len(successful_results),
'stats': stats,
'detailed_results': results
}
六、幻觉检测
6.1 Self-RAG
class SelfRAGDetector:
"""
Self-RAG 幻觉检测
通过自我反思检测和减少幻觉
"""
def __init__(self, llm_client):
self.llm = llm_client
def detect_hallucination(
self,
context: str,
answer: str
) -> dict:
"""
检测幻觉
"""
# 1. 提取声明
claims = self._extract_factual_claims(answer)
# 2. 逐个验证
verified_claims = []
hallucinated_claims = []
for claim in claims:
is_hallucinated, confidence = self._verify_claim(claim, context)
if is_hallucinated:
hallucinated_claims.append({
'claim': claim,
'confidence': confidence,
'severity': self._assess_severity(claim, confidence)
})
else:
verified_claims.append(claim)
# 3. 计算幻觉率
hallucination_rate = len(hallucinated_claims) / len(claims) if claims else 0
return {
'hallucination_rate': hallucination_rate,
'total_claims': len(claims),
'verified_claims': verified_claims,
'hallucinated_claims': hallucinated_claims,
'severity_distribution': self._get_severity_distribution(hallucinated_claims)
}
def _extract_factual_claims(self, text: str) -> List[str]:
"""提取事实声明"""
prompt = f"""从以下文本中提取所有可验证的事实声明。
忽略观点、感受、通用描述。
文本:
{text}
每个声明一行:"""
result = self.llm.generate(prompt)
return [c.strip() for c in result.split('\n') if c.strip()]
def _verify_claim(
self,
claim: str,
context: str
) -> tuple:
"""
验证声明
返回:(是否幻觉, 置信度)
"""
prompt = f"""判断以下声明是否与上下文一致。
声明:{claim}
上下文:
{context}
分析步骤:
1. 检查声明中的每个事实是否在上下文中
2. 检查数字、日期等是否匹配
3. 检查因果关系是否合理
判断:
A. 完全一致 - 上下文明确支持
B. 基本一致 - 上下文暗示或可推断
C. 不确定 - 上下文未提及
D. 不一致 - 上下文与声明矛盾
判断结果和置信度(0-1):"""
result = self.llm.generate(prompt)
# 解析结果
if "A. 完全一致" in result or "完全一致" in result:
return False, 0.95
elif "B. 基本一致" in result or "基本一致" in result:
return False, 0.75
elif "D. 不一致" in result or "不一致" in result:
return True, 0.9
else:
return True, 0.5 # 不确定视为可能的幻觉
def _assess_severity(self, claim: str, confidence: float) -> str:
"""评估严重程度"""
# 检查是否包含关键信息
critical_keywords = ['所有', '每个', '永远', '从不', '100%', '唯一']
has_critical = any(kw in claim for kw in critical_keywords)
if has_critical and confidence > 0.8:
return "High"
elif confidence > 0.9:
return "Medium"
else:
return "Low"
def _get_severity_distribution(self, claims: List[Dict]) -> dict:
"""获取严重程度分布"""
dist = {'High': 0, 'Medium': 0, 'Low': 0}
for c in claims:
dist[c['severity']] = dist.get(c['severity'], 0) + 1
return dist
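一个最小的调用示意(llm_client 为任意满足 generate(prompt) -> str 接口的客户端,数据仅为示例;实际判定结果取决于所用 LLM):
```python
detector = SelfRAGDetector(llm_client)
report = detector.detect_hallucination(
    context="该药品的常见副作用为头痛和恶心,发生率约为5%。",
    answer="该药品的常见副作用为头痛和恶心,发生率约为5%,且对所有人群绝对安全。",
)
print(report["hallucination_rate"])
# 理想情况下,"对所有人群绝对安全"这类无依据的声明应出现在 hallucinated_claims 中
print(report["hallucinated_claims"])
```
6.2 SARF检测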
class SARFDetector:
"""
SARF (Self-Adaptive Reference-Free) 幻觉检测
无需参考的自我适应幻觉检测
"""
def __init__(self, llm_client):
self.llm = llm_client
def detect(
self,
answer: str,
question: str = None
) -> dict:
"""
无参考幻觉检测
"""
# 1. 自洽性检查
consistency = self._check_self_consistency(answer)
# 2. 不确定性检测
uncertainty = self._detect_uncertainty(answer)
# 3. 过度自信检测
overconfidence = self._detect_overconfidence(answer)
# 4. 逻辑一致性
logical_consistency = self._check_logical_consistency(answer)
# 5. 综合评分
overall_score = self._calculate_hallucination_score(
consistency,
uncertainty,
overconfidence,
logical_consistency
)
return {
'hallucination_score': overall_score,
'self_consistency': consistency,
'uncertainty_detected': uncertainty,
'overconfidence_detected': overconfidence,
'logical_consistency': logical_consistency,
'risk_level': self._get_risk_level(overall_score)
}
def _check_self_consistency(self, answer: str) -> float:
"""检查自洽性"""
# 提取关键陈述
statements = self._extract_statements(answer)
if len(statements) < 2:
return 1.0 # 无法检查
# 检查陈述间的一致性
prompt = f"""分析以下陈述之间是否存在逻辑矛盾。
陈述列表:
{chr(10).join([f"{i+1}. {s}" for i, s in enumerate(statements)])}
判断:
- 如果所有陈述逻辑一致,返回"一致"
- 如果存在矛盾,指出矛盾之处
分析结果:"""
result = self.llm.generate(prompt)
if "一致" in result:
return 1.0
elif "矛盾" in result or "冲突" in result:
return 0.3
else:
return 0.7
def _detect_uncertainty(self, answer: str) -> dict:
"""检测不确定性表达"""
uncertainty_markers = [
'可能', '大概', '也许', '似乎', '好像',
'我认为', '据我所知', '一般来说',
'不确定', '无法确定', '不清楚'
]
found_markers = []
for marker in uncertainty_markers:
if marker in answer:
found_markers.append(marker)
# 过多的不确定性表达可能意味着模型对内容没有把握
# 中文按空格分词不可靠,这里以句子数为基准估算密度
sentence_count = max(len(self._extract_statements(answer)), 1)
uncertainty_ratio = len(found_markers) / sentence_count
return {
'markers_found': found_markers,
'ratio': uncertainty_ratio,
'is_suspicious': len(found_markers) > 2  # 不确定性标记过多视为可疑
}
def _detect_overconfidence(self, answer: str) -> dict:
"""检测过度自信"""
overconfidence_markers = [
'绝对', '肯定', '一定', '毫无疑问',
'所有人', '所有人都会', '绝对不会',
'100%', '必然', '必定'
]
found_markers = []
for marker in overconfidence_markers:
if marker in answer:
found_markers.append(marker)
return {
'markers_found': found_markers,
'count': len(found_markers),
'is_suspicious': len(found_markers) > 2
}
def _check_logical_consistency(self, answer: str) -> float:
"""检查逻辑一致性"""
# 简化:检查因果关系是否合理
prompt = f"""分析以下文本的逻辑是否一致。
文本:
{answer}
检查:
1. 因果关系是否合理
2. 条件关系是否自洽
3. 数字是否前后一致
判断(0-1分,1表示完全一致):"""
try:
result = self.llm.generate(prompt).strip()
return float(result)
except (ValueError, TypeError):
return 0.5
def _calculate_hallucination_score(
self,
consistency: float,
uncertainty: dict,
overconfidence: dict,
logical_consistency: float
) -> float:
"""计算幻觉分数"""
# 基础分数
score = (consistency + logical_consistency) / 2
# 不确定性惩罚
if uncertainty['is_suspicious']:
score *= 0.9
# 过度自信惩罚
if overconfidence['is_suspicious']:
score *= 0.85
return max(0, min(1, score))
def _extract_statements(self, text: str) -> List[str]:
"""提取陈述"""
import re
# 按句子分割
sentences = re.split(r'[。!?\n]', text)
return [s.strip() for s in sentences if s.strip() and len(s) > 10]
def _get_risk_level(self, score: float) -> str:
"""获取风险等级"""
if score >= 0.8:
return "Low Risk"
elif score >= 0.6:
return "Medium Risk"
else:
return "High Risk"七、评估报告生成
from datetime import datetime

class EvaluationReportGenerator:
"""评估报告生成器"""
def __init__(self, evaluator: RAGASEvaluator, detector: SelfRAGDetector):
self.evaluator = evaluator
self.detector = detector
def generate_report(
self,
test_cases: List[Dict]
) -> str:
"""生成完整评估报告"""
# 批量评估
batch_evaluator = BatchRAGASEvaluator(self.evaluator)
eval_results = batch_evaluator.evaluate_dataset(test_cases)
# 生成报告
report = f"""
# RAG系统评估报告
## 评估概览
- 测试用例总数:{eval_results['total_cases']}
- 成功评估:{eval_results['successful']}
- 失败评估:{eval_results['failed']}
## 核心指标
| 指标 | 平均分数 | 评估 |
|------|---------|------|
| Faithfulness(忠实度) | {eval_results['stats'].get('faithfulness', 0):.3f} | {'✓' if eval_results['stats'].get('faithfulness', 0) > 0.7 else '✗'} |
| Answer Relevance(相关性) | {eval_results['stats'].get('answer_relevance', 0):.3f} | {'✓' if eval_results['stats'].get('answer_relevance', 0) > 0.6 else '✗'} |
| Context Precision(精确度) | {eval_results['stats'].get('context_precision', 0):.3f} | {'✓' if eval_results['stats'].get('context_precision', 0) > 0.5 else '✗'} |
| Context Recall(召回率) | {eval_results['stats'].get('context_recall', 'N/A')} | - |
| Overall Score(综合分) | {eval_results['stats'].get('overall_score', 0):.3f} | - |
## 改进建议
{self._generate_recommendations(eval_results['stats'])}
---
*报告生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""
return report
def _generate_recommendations(self, stats: dict) -> str:
"""生成改进建议"""
recommendations = []
if stats.get('faithfulness', 0) < 0.7:
recommendations.append("1. **提升忠实度**:检查上下文是否充分支持生成内容,避免添加上下文外的信息。")
if stats.get('answer_relevance', 0) < 0.6:
recommendations.append("2. **提升答案相关性**:优化检索策略,确保检索到更相关的内容。")
if stats.get('context_precision', 0) < 0.5:
recommendations.append("3. **提升上下文精确度**:使用更精细的重排和过滤机制。")
if not recommendations:
recommendations.append("✓ 系统表现良好,继续保持当前策略。")
return "\n".join(recommendations)八、相关主题
九、参考文献
- Es, S., et al. (2023). RAGAS: Automated Evaluation of Retrieval Augmented Generation.
- Manakul, P., et al. (2023). SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models.
- Shi, W., et al. (2023). Filtering Automatic Retrieval with Chain-of-Thought Reasoning.
- Liu, Y., et al. (2023). DS-1000: A Natural Language-to-SQL Benchmark.