摘要
多轮对话是AI应用的核心场景,但对话历史越来越长怎么办?这篇专门为零基础读者讲解:对话历史管理的核心挑战、三大策略(完整历史/摘要历史/固定窗口)、重要性加权、遗忘机制,以及长期记忆与短期记忆怎么配合。看完你就能做出一个“不会失忆”的AI助手了。
先思考一个问题:AI为什么会“失忆”?
想象你和朋友的对话
你和一个朋友聊天:
第1轮:
你:你好,我叫小明
朋友:你好小明,很高兴认识你!
第10轮:
你:对了,我之前说的那件事...
朋友:什么事?你没说过啊?
原因:朋友记不住10轮前的对话了!
AI的“失忆”原因
AI的失忆不是因为它笨,而是因为上下文窗口有限:
假设上下文窗口 = 1000 tokens
对话历史:
第1轮:100 tokens ✓ (还能记住)
第2轮:100 tokens ✓ (还能记住)
第3轮:100 tokens ✓ (还能记住)
...
第10轮:100 tokens → 总共1000 tokens → 满了!
第11轮:100 tokens → 超出限制!
→ 必须截断旧的历史 → 失忆!
对话历史管理的核心挑战
| 挑战 | 影响 | 怎么办 |
|---|---|---|
| 窗口限制 | 历史太长装不下 | 摘要/截断 |
| 注意力稀释 | 早期信息被淹没 | 重要性加权 |
| 成本累积 | 历史越长越贵 | 定期清理 |
| 相关性衰减 | 旧对话与当前任务无关 | 选择性遗忘 |
一、三大管理策略对比
策略一览
| 策略 | 原理 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|---|
| 完整历史 | 保存所有对话 | 信息完整 | 成本高、会超限 | 短对话 |
| 摘要历史 | 定期压缩成摘要 | 节省空间 | 可能丢细节 | 长对话 |
| 固定窗口 | 只保留最近N轮 | 简单 | 可能丢重要信息 | 简单场景 |
策略1:完整历史(最简单)
class FullHistoryManager:
    """Keep every message and hand back the newest ones that fit a budget.

    Nothing is ever summarised or removed from ``history``; ``get_context``
    only selects the most recent messages whose estimated token total stays
    within ``max_context_tokens``. Intended for short conversations.
    """

    def __init__(self, max_context_tokens: int = 50000):
        self.max_context_tokens = max_context_tokens
        self.history = []

    def add_message(self, role: str, content: str):
        """Append one ``{'role', 'content'}`` message to the history."""
        self.history.append({
            'role': role,
            'content': content
        })

    def get_context(self) -> list:
        """Return the newest messages that fit the budget, oldest first.

        Returns:
            A list of message dicts in conversation order. It is empty when
            even the newest message alone exceeds the budget.
        """
        total_tokens = 0
        selected = []
        # Walk from the newest message backwards and stop at the first one
        # that would push the running total over the budget.
        for msg in reversed(self.history):
            msg_tokens = self._estimate_tokens(msg['content'])
            if total_tokens + msg_tokens > self.max_context_tokens:
                break
            selected.append(msg)
            total_tokens += msg_tokens
        # Collected newest-first; flip once to restore conversation order.
        selected.reverse()
        return selected

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Return a coarse token estimate for ``text`` (not a real tokenizer).

        Each CJK ideograph (U+4E00..U+9FFF) counts as 0.5 token and every
        other character as 0.25 token. Both terms are character counts, so
        the estimate can never go negative.
        """
        cjk_chars = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
        other_chars = len(text) - cjk_chars
        return int(cjk_chars * 0.5 + other_chars * 0.25)


# [section heading] 策略2:摘要历史(最常用)
class SummarizedHistoryManager:
    """Conversation history that folds older turns into an LLM summary.

    Messages accumulate in ``current_history``. Once the estimated size of
    the summary plus the live messages exceeds ``summary_trigger_tokens``,
    the older half of the live messages is replaced by a summary obtained
    from ``llm_client.generate(prompt)``.
    """

    def __init__(
        self,
        llm_client,
        max_history_tokens: int = 8000,
        summary_trigger_tokens: int = 5000
    ):
        self.llm = llm_client
        # Stored as given; no method of this class reads it.
        self.max_history_tokens = max_history_tokens
        self.summary_trigger_tokens = summary_trigger_tokens
        # Live messages that have not been summarised yet.
        self.current_history = []
        # Running summary of everything already folded away, or None.
        self.summary = None

    def add_message(self, role: str, content: str):
        """Append a message and summarise when the trigger is exceeded."""
        self.current_history.append({
            'role': role,
            'content': content
        })
        if self._total_tokens() > self.summary_trigger_tokens:
            self._trigger_summarization()

    def _trigger_summarization(self):
        """Compress the older half of the live messages into the summary.

        Does nothing while fewer than four live messages exist.
        """
        if len(self.current_history) < 4:
            return
        half = len(self.current_history) // 2
        to_summarize = self.current_history[:half]
        # State is only updated after the LLM call returns, so a failing
        # call leaves both the summary and the history untouched.
        self.summary = self._generate_summary(to_summarize)
        self.current_history = self.current_history[half:]

    def _generate_summary(self, messages: list) -> str:
        """Ask the LLM for a summary of ``messages`` plus any prior summary.

        The previous summary is placed in front of the dialogue so that
        content folded away in earlier rounds is carried into the new
        summary instead of being overwritten.
        """
        dialogue = self._format_messages(messages)
        if self.summary:
            dialogue = f"[早期对话摘要]\n{self.summary}\n{dialogue}"
        prompt = (
            "请总结以下对话的核心内容:\n"
            f"{dialogue}\n"
            "要求:\n"
            "1. 保留关键话题和决定\n"
            "2. 保留重要的用户需求\n"
            "3. 保留AI提供的解决方案\n"
            "4. 删除重复和细节\n"
            "5. 控制在200字以内"
        )
        return self.llm.generate(prompt).strip()

    def get_context(self) -> str:
        """Return the summary (if any) followed by the live messages."""
        parts = []
        if self.summary:
            parts.append(f"[早期对话摘要]\n{self.summary}\n")
        parts.extend(
            f"{msg['role']}: {msg['content']}" for msg in self.current_history
        )
        return "\n".join(parts)

    def _format_messages(self, messages: list) -> str:
        """Render messages as ``role: content`` lines."""
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

    def _total_tokens(self) -> int:
        """Estimated tokens of the summary plus all live messages."""
        total = self._estimate_tokens(self.summary) if self.summary else 0
        for msg in self.current_history:
            total += self._estimate_tokens(msg['content'])
        return total

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Return a coarse token estimate for ``text`` (not a real tokenizer).

        Each CJK ideograph (U+4E00..U+9FFF) counts as 0.5 token and every
        other character as 0.25 token. Both terms are character counts, so
        the estimate can never go negative.
        """
        cjk_chars = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
        other_chars = len(text) - cjk_chars
        return int(cjk_chars * 0.5 + other_chars * 0.25)


# [section heading] 策略3:固定窗口摘要(带重叠)
class FixedWindowHistoryManager:
    """Keep a bounded window of recent messages plus summaries of the rest.

    When the live messages exceed ``window_size`` estimated tokens, the
    leading messages are summarised through ``llm_client.summarize(text)``
    and moved out of the window. Up to ``overlap_tokens`` worth of the
    most recent summarised messages stay in the window as overlap.
    """

    def __init__(
        self,
        llm_client,
        window_size: int = 6000,
        overlap_tokens: int = 500
    ):
        self.llm = llm_client
        self.window_size = window_size
        self.overlap_tokens = overlap_tokens
        self.messages = []   # live messages inside the window
        self.summaries = []  # one summary string per summarisation round

    def add_message(self, role: str, content: str):
        """Append a message and summarise when the window overflows."""
        self.messages.append({'role': role, 'content': content})
        if self._total_tokens() > self.window_size:
            self._summarize_old()

    def _summarize_old(self):
        """Summarise the messages that precede the overflow point."""
        running = 0
        split_index = None
        for i, msg in enumerate(self.messages):
            running += self._estimate_tokens(msg['content'])
            if running > self.window_size:
                split_index = i
                break
        # Nothing overflows, or the only message present is the newest one.
        if split_index is None or len(self.messages) < 2:
            return
        # If the very first message already overflows the window, still
        # summarise it; otherwise every later call would stop at index 0
        # and the window would grow without bound.
        split_index = max(split_index, 1)

        old_messages = self.messages[:split_index]
        old_summary = self.llm.summarize(self._format_messages(old_messages))
        self.summaries.append(old_summary)

        overlap = self._get_overlap(old_messages)
        self.messages = overlap + self.messages[split_index:]

    def _get_overlap(self, old_messages: list) -> list:
        """Return the trailing ``old_messages`` that fit ``overlap_tokens``."""
        overlap_size = 0
        tail = []
        for msg in reversed(old_messages):
            msg_tokens = self._estimate_tokens(msg['content'])
            if overlap_size + msg_tokens > self.overlap_tokens:
                break
            tail.append(msg)
            overlap_size += msg_tokens
        # Collected newest-first; restore conversation order.
        tail.reverse()
        return tail

    def get_context(self) -> str:
        """Return the numbered summaries followed by the live messages."""
        parts = []
        if self.summaries:
            parts.append("[历史对话摘要]")
            for i, s in enumerate(self.summaries):
                parts.append(f"阶段{i+1}: {s}")
            parts.append("")
        for msg in self.messages:
            parts.append(f"{msg['role']}: {msg['content']}")
        return "\n".join(parts)

    def _format_messages(self, messages: list) -> str:
        """Render messages as ``role: content`` lines."""
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

    def _total_tokens(self) -> int:
        """Estimated tokens of the live messages (summaries not counted)."""
        return sum(self._estimate_tokens(m['content']) for m in self.messages)

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Return a coarse token estimate for ``text`` (not a real tokenizer).

        Each CJK ideograph (U+4E00..U+9FFF) counts as 0.5 token and every
        other character as 0.25 token. Both terms are character counts, so
        the estimate can never go negative.
        """
        cjk_chars = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
        other_chars = len(text) - cjk_chars
        return int(cjk_chars * 0.5 + other_chars * 0.25)


# [section heading] 二、重要性加权:哪些该记住?
不是所有消息都一样重要
对话记录:
1. 用户:"我叫小明,请记住" ← 重要!(用户偏好)
2. AI:"好的,我记住了" ← 重要!(确认)
3. 用户:"今天天气不错" ← 不重要(闲聊)
4. AI:"是啊,阳光明媚" ← 不重要(闲聊)
5. 用户:"帮我写一个登录功能" ← 重要!(任务)
6. AI:"好的,这是代码..." ← 重要!(结果)
重要性评分系统
class ImportanceScorer:
    """Heuristic scorer that rates how important a chat message is."""

    def __init__(self):
        # Substrings that raise a message's importance.
        self.important_keywords = [
            '记住', '重要', '关键', '不要', '确认',
            '我的名字', '偏好', '叫我'
        ]
        # Substrings that lower a message's importance.
        self.noise_keywords = [
            '顺便', '另外', '对了', '哦', '对了对了'
        ]

    def score(self, role: str, content: str) -> float:
        """Return an importance score for one message.

        The score starts at 0.5 and is adjusted step by step:
        +0.1 for a ``'user'`` role, +0.3 per matching important keyword
        (capped at 1.0), -0.2 per matching noise keyword (floored at 0.1),
        +0.2 when the content contains a triple backtick, and +0.1 when it
        contains any digit (both capped at 1.0).
        """
        value = 0.5
        if role == 'user':
            value += 0.1

        # Every matching keyword applies its own capped adjustment.
        boosts = sum(1 for kw in self.important_keywords if kw in content)
        for _ in range(boosts):
            value = min(value + 0.3, 1.0)

        penalties = sum(1 for kw in self.noise_keywords if kw in content)
        for _ in range(penalties):
            value = max(value - 0.2, 0.1)

        # A triple backtick marks a fenced code block.
        if '```' in content:
            value = min(value + 0.2, 1.0)

        has_digit = any(ch.isdigit() for ch in content)
        if has_digit:
            value = min(value + 0.1, 1.0)
        return value
class WeightedHistoryManager:
    """Conversation history that keeps important messages verbatim.

    Every message is scored by ``ImportanceScorer``. When the estimated
    token total exceeds ``max_tokens``, the less important messages are
    replaced by one summary obtained from ``llm_client.summarize(text)``.
    """

    def __init__(self, llm_client, max_tokens: int = 8000):
        self.llm = llm_client
        self.max_tokens = max_tokens
        self.messages = []
        self.scorer = ImportanceScorer()

    def add_message(self, role: str, content: str):
        """Score and store a message, compressing if over budget."""
        importance = self.scorer.score(role, content)
        # Derive the timestamp from the largest one in use rather than from
        # len(self.messages): the list shrinks on compression, so a
        # length-based value would repeat or go backwards and break the
        # chronological sort performed by _smart_compress.
        next_timestamp = max(
            (m['timestamp'] for m in self.messages), default=-1
        ) + 1
        self.messages.append({
            'role': role,
            'content': content,
            'importance': importance,
            'timestamp': next_timestamp
        })
        if self._total_tokens() > self.max_tokens:
            self._smart_compress()

    def _smart_compress(self):
        """Keep the important messages and summarise the remainder.

        Messages scoring above 0.7 are kept. If those make up less than
        30% of the history, the top third by importance is kept instead.
        Everything else is summarised into a single ``system`` message.
        """
        ranked = sorted(
            self.messages,
            key=lambda m: m['importance'],
            reverse=True
        )
        important = [m for m in ranked if m['importance'] > 0.7]
        rest = [m for m in ranked if m['importance'] <= 0.7]
        if len(important) < len(self.messages) * 0.3:
            # Too few high scorers: fall back to the top third by rank.
            cut = len(ranked) // 3
            important = ranked[:cut]
            rest = ranked[cut:]

        if rest:
            # Hand the summariser the dialogue in conversation order, not
            # in importance order.
            rest.sort(key=lambda m: m['timestamp'])
            summary = self.llm.summarize(
                self._format_messages(rest)
            )
            important.append({
                'role': 'system',
                'content': f"[早期对话摘要] {summary}",
                'importance': 0.6,
                # Sorts ahead of every real message, whose timestamps
                # start at 0.
                'timestamp': -1
            })

        # Restore chronological order for the kept messages.
        self.messages = sorted(important, key=lambda m: m['timestamp'])

    def get_context(self) -> str:
        """Return all stored messages as ``role: content`` lines."""
        return self._format_messages(self.messages)

    def _format_messages(self, messages: list) -> str:
        """Render messages as ``role: content`` lines."""
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

    def _total_tokens(self) -> int:
        """Estimated tokens of all stored messages."""
        return sum(self._estimate_tokens(m['content']) for m in self.messages)

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Return a coarse token estimate for ``text`` (not a real tokenizer).

        Each CJK ideograph (U+4E00..U+9FFF) counts as 0.5 token and every
        other character as 0.25 token. Both terms are character counts, so
        the estimate can never go negative.
        """
        cjk_chars = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
        other_chars = len(text) - cjk_chars
        return int(cjk_chars * 0.5 + other_chars * 0.25)


# [section heading] 三、遗忘机制:怎么让AI“选择性遗忘”
人脑的遗忘机制
人脑不会记住所有事情,而是:
- 经常用的记得牢
- 很久不用的会忘记
- 重要的事情记得久
AI也可以模拟这种机制!
基于时间的遗忘
import time
from datetime import datetime, timedelta
class TimeBasedForgetting:
    """Message store whose entries fade with age and expire eventually.

    Messages older than ``2 * max_age_hours`` are deleted when
    ``get_active_messages`` runs. Younger ones are reported as active while
    their decay factor stays above 0.3.
    """

    def __init__(
        self,
        max_age_hours: int = 24,
        decay_rate: float = 0.1
    ):
        self.max_age_hours = max_age_hours
        self.decay_rate = decay_rate
        self.messages = []

    def add_message(self, role: str, content: str):
        """Store a message stamped with the current local time."""
        self.messages.append({
            'role': role,
            'content': content,
            'created_at': datetime.now(),
            'last_accessed': datetime.now(),
            'access_count': 0
        })

    def access(self, message_id: int):
        """Record an access on the message whose ``id()`` is ``message_id``.

        ``message_id`` is compared against the built-in ``id()`` of each
        stored message dict. Only the first match is updated; an unknown id
        is ignored.
        """
        for msg in self.messages:
            if id(msg) == message_id:
                msg['last_accessed'] = datetime.now()
                msg['access_count'] += 1
                break

    def get_active_messages(self) -> list:
        """Purge expired messages and return those still considered active.

        Returns:
            The stored messages whose decay factor is above 0.3, each
            annotated with a ``'decay'`` key holding that factor.
        """
        now = datetime.now()
        expiry = timedelta(hours=self.max_age_hours * 2)
        survivors = []
        active = []
        # Build the surviving list separately instead of removing entries
        # from self.messages while looping over it, which would skip the
        # element that follows each removed one.
        for msg in self.messages:
            if now - msg['created_at'] > expiry:
                continue
            survivors.append(msg)
            decay = self._calculate_decay(msg, now)
            if decay > 0.3:  # retention threshold
                msg['decay'] = decay
                active.append(msg)
        # Update in place so outside references to the list stay valid.
        self.messages[:] = survivors
        return active

    def _calculate_decay(self, msg: dict, now: datetime) -> float:
        """Return the retention factor of ``msg`` at time ``now`` (max 1.0).

        The time term falls linearly with age and is floored at 0. Each
        recorded access adds 0.1, up to a total bonus of 0.3.
        """
        age = now - msg['created_at']
        hours = age.total_seconds() / 3600
        time_decay = max(0, 1 - hours * self.decay_rate / self.max_age_hours)
        access_boost = min(msg['access_count'] * 0.1, 0.3)
        return min(time_decay + access_boost, 1.0)


# [section heading] 基于重要性的遗忘
class ImportanceBasedForgetting:
    """Bounded message store that forgets the least important entries.

    At most ``max_messages`` messages are kept. When the limit is exceeded
    the lowest-importance messages are dropped, and the first 50 characters
    of each dropped message are remembered in ``key_points``.
    """

    def __init__(
        self,
        max_messages: int = 50,
        min_importance: float = 0.2
    ):
        self.max_messages = max_messages
        # Stored as given; no method of this class reads it.
        self.min_importance = min_importance
        self.messages = []
        self.key_points = []  # snippets of forgotten messages

    def add_message(self, role: str, content: str, importance: float = 0.5):
        """Store a message with its importance, then enforce the limit."""
        self.messages.append({
            'role': role,
            'content': content,
            'importance': importance
        })
        self._forget_if_needed()

    def _forget_if_needed(self):
        """Drop the least important messages once the limit is exceeded.

        The surviving messages keep their original conversation order, so
        ``get_context`` still reads as a dialogue.
        """
        if len(self.messages) <= self.max_messages:
            return
        # Rank positions, not messages, so the survivors can be rebuilt in
        # their original order. The sort is stable, so ties keep the
        # earlier message ranked first.
        ranked = sorted(
            range(len(self.messages)),
            key=lambda i: self.messages[i]['importance'],
            reverse=True
        )
        keep = set(ranked[:self.max_messages])
        forgotten = [self.messages[i] for i in ranked[self.max_messages:]]
        self.messages = [
            msg for i, msg in enumerate(self.messages) if i in keep
        ]
        for msg in forgotten:
            # Simplified key-point extraction: the first 50 characters.
            key_point = msg['content'][:50]
            if key_point not in self.key_points:
                self.key_points.append(key_point)

    def get_context(self) -> str:
        """Return up to five latest key points followed by the messages."""
        parts = []
        if self.key_points:
            parts.append("[之前提到的]")
            for point in self.key_points[-5:]:
                parts.append(f"- {point}")
            parts.append("")
        for msg in self.messages:
            parts.append(f"{msg['role']}: {msg['content']}")
        return "\n".join(parts)


# [section heading] 四、长期记忆与短期记忆整合
人脑的记忆分层
┌─────────────────────────────────────────────────┐
│ 长期记忆 (Long-term) │
│ - 你的名字、偏好、习惯 │
│ - 重要的事实和知识 │
│ - 跨会话的上下文 │
├─────────────────────────────────────────────────┤
│ 短期记忆 (Short-term) │
│ - 当前对话的内容 │
│ - 进行中的任务 │
│ - 临时需要的参考 │
├─────────────────────────────────────────────────┤
│ 工作记忆 (Working) │
│ - 当前正在处理的信息 │
│ - 即时的思考焦点 │
└─────────────────────────────────────────────────┘
整合实现
class IntegratedMemoryManager:
    """Combine working, short-term and long-term memory for a chat agent.

    * Working memory: the current task and focus.
    * Short-term memory: recent messages held in ``short_term``.
    * Long-term memory: a summary persisted through ``storage`` using
      ``storage.save(key, value)`` and ``storage.load(key)``.
    """

    def __init__(self, llm_client, storage=None):
        self.llm = llm_client
        self.storage = storage  # optional long-term backend
        self.short_term = []
        self.working_memory = {
            'current_task': None,
            'focus': None
        }

    def add_to_short_term(self, role: str, content: str):
        """Append a message, then check for long-term extraction."""
        self.short_term.append({
            'role': role,
            'content': content,
            'timestamp': datetime.now()
        })
        self._check_long_term_extraction()

    def _check_long_term_extraction(self):
        """Once 10 messages are buffered, extract key facts and trim to 3.

        The whole buffer is shown to the LLM, not only its tail, because
        all but the last three messages are discarded right afterwards and
        would otherwise be dropped without ever being inspected.
        """
        if len(self.short_term) < 10:
            return
        dialogue = self._format_dialogue(self.short_term)
        prompt = (
            "分析以下对话,判断是否有需要长期记住的信息。\n"
            f"{dialogue}\n"
            "如果有关键信息(如用户偏好、重要决定),输出需要记住的要点。\n"
            "否则输出\"无需保留\"。"
        )
        result = self.llm.generate(prompt).strip()
        if "无需保留" not in result:
            self._save_to_long_term(result)
        # Keep only the three most recent messages.
        self.short_term = self.short_term[-3:]

    def _save_to_long_term(self, summary: str):
        """Persist ``summary`` under ``'long_term_memory'`` if storage is set."""
        if self.storage:
            # NOTE(review): the same key is written every time. If the
            # backend overwrites on save, earlier summaries are lost.
            # Confirm the storage semantics.
            self.storage.save('long_term_memory', {
                'summary': summary,
                'timestamp': datetime.now().isoformat()
            })

    def get_long_term_memory(self, query: str = "") -> str:
        """Return the stored summary, optionally gated by ``query``.

        With a non-empty ``query``, the summary is returned only if one of
        the first three whitespace-separated query terms occurs in it.
        Returns ``""`` when there is no storage, no stored memory, or no
        match.
        """
        if not self.storage:
            return ""
        memory = self.storage.load('long_term_memory')
        if not memory:
            return ""
        summary = memory.get('summary', '')
        if query:
            # Plain substring match on up to three query terms.
            if any(kw in summary for kw in query.split()[:3]):
                return summary
            return ""
        return summary

    def update_working_memory(self, task: str = None, focus: str = None):
        """Set the current task and/or focus; falsy values are ignored."""
        if task:
            self.working_memory['current_task'] = task
        if focus:
            self.working_memory['focus'] = focus

    def get_full_context(self, current_query: str = "") -> str:
        """Assemble working, long-term and short-term memory into text.

        Sections are joined with blank lines, in this order: current task,
        current focus, user background, recent dialogue.
        """
        parts = []
        if self.working_memory['current_task']:
            parts.append(f"[当前任务] {self.working_memory['current_task']}")
        if self.working_memory['focus']:
            parts.append(f"[当前焦点] {self.working_memory['focus']}")
        long_term = self.get_long_term_memory(current_query)
        if long_term:
            parts.append(f"[用户背景]\n{long_term}")
        if self.short_term:
            parts.append("[近期对话]")
            for msg in self.short_term:
                parts.append(f"{msg['role']}: {msg['content']}")
        return "\n\n".join(parts)

    def _format_dialogue(self, messages: list) -> str:
        """Render messages as ``role: content`` lines."""
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)


# [section heading] 五、生产级对话管理
完整实现
class ProductionConversationManager:
    """Conversation manager with two-tier summarisation.

    Above ``summary_threshold`` estimated tokens the older half of the live
    messages is summarised. Above ``max_tokens`` all but the most recent
    third is summarised. Summaries come from ``llm_client.summarize(text)``
    and accumulate in ``summary_history``.
    """

    def __init__(
        self,
        llm_client,
        max_tokens: int = 80000,
        summary_threshold: int = 30000
    ):
        self.llm = llm_client
        self.max_tokens = max_tokens
        self.summary_threshold = summary_threshold
        self.summary_history = []   # one summary string per round
        self.current_messages = []  # live, unsummarised messages
        # NOTE(review): Summarizer is not defined in the visible part of
        # this file and no method of this class uses the attribute.
        # Confirm it is provided elsewhere.
        self.summarizer = Summarizer(llm_client)

    def add_message(self, role: str, content: str):
        """Append a timestamped message, then check the token budget."""
        message = {
            'role': role,
            'content': content,
            'timestamp': datetime.now().isoformat()
        }
        self.current_messages.append(message)
        self._check_summarization()

    def _check_summarization(self):
        """Choose the summarisation tier that matches the current size."""
        total = self._total_tokens()
        if total > self.max_tokens:
            self._perform_summarization()
        elif total > self.summary_threshold:
            self._progressive_summarize()

    def _perform_summarization(self):
        """Summarise everything except the most recent third.

        At least one message is always kept. An explicit split index is
        used because a keep count of 0 would turn ``[:-0]`` into an empty
        slice and ``[-0:]`` into the whole list.
        """
        count = len(self.current_messages)
        keep_count = max(1, count // 3)
        split = count - keep_count
        if split <= 0:
            return  # nothing old enough to summarise
        to_summarize = self.current_messages[:split]
        summary = self.llm.summarize(self._format_dialogue(to_summarize))
        self.summary_history.append(summary)
        self.current_messages = self.current_messages[split:]

    def _progressive_summarize(self):
        """Summarise only the older half of the live messages."""
        half = len(self.current_messages) // 2
        if half == 0:
            # A single message has no older half; skip the LLM call.
            return
        to_summarize = self.current_messages[:half]
        summary = self.llm.summarize(self._format_dialogue(to_summarize))
        self.summary_history.append(summary)
        self.current_messages = self.current_messages[half:]

    def get_context(self) -> str:
        """Return numbered summaries then live messages, blank-line joined."""
        parts = []
        if self.summary_history:
            parts.append("[早期对话摘要]")
            for i, s in enumerate(self.summary_history):
                parts.append(f"阶段{i+1}: {s}")
            parts.append("")
        for msg in self.current_messages:
            parts.append(f"{msg['role']}: {msg['content']}")
        return "\n\n".join(parts)

    def _format_dialogue(self, messages: list) -> str:
        """Render messages as ``role: content`` lines."""
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)

    def _total_tokens(self) -> int:
        """Estimated tokens of all summaries plus all live messages."""
        summary_tokens = sum(
            self._estimate_tokens(s) for s in self.summary_history
        )
        message_tokens = sum(
            self._estimate_tokens(m['content']) for m in self.current_messages
        )
        return summary_tokens + message_tokens

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Return a coarse token estimate for ``text`` (not a real tokenizer).

        Each CJK ideograph (U+4E00..U+9FFF) counts as 0.5 token and every
        other character as 0.25 token. Both terms are character counts, so
        the estimate can never go negative.
        """
        cjk_chars = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
        other_chars = len(text) - cjk_chars
        return int(cjk_chars * 0.5 + other_chars * 0.25)


# [section heading] 六、实战调用示例
# 使用对话管理器的完整示例
def chatbot_example():
    """Run a scripted chat through the manager and print the LLM's answer.

    Relies on a module-level ``llm`` object that provides ``generate`` and
    is also passed to ``ProductionConversationManager`` as its client.
    """
    # Pick the strategy and limits to suit the scenario.
    manager = ProductionConversationManager(
        llm_client=llm,
        max_tokens=80000,
        summary_threshold=30000
    )

    # Scripted dialogue as (role, content) pairs.
    conversation = [
        ("user", "我叫小明,帮我做一个用户管理系统"),
        ("assistant", "好的小明,用户管理系统需要哪些功能?"),
        ("user", "需要用户注册、登录、权限管理"),
        ("assistant", "明白,以下是基本架构..."),
        ("user", "好的,用Django实现吧"),
        ("assistant", "以下是Django实现..."),
        ("user", "登录功能需要支持微信登录"),
        ("assistant", "微信登录可以这样实现..."),
        # ... assume the conversation goes on for 50 rounds
    ]
    for turn in conversation:
        role, content = turn
        manager.add_message(role, content)

    current_question = "我之前说的系统,支持哪些登录方式?"

    # Prompt layout: history, then the question, then the instruction.
    context = manager.get_context()
    prompt = "\n".join([
        f"{context}",
        f"用户问题:{current_question}",
        "请基于对话历史回答。",
    ])
    response = llm.generate(prompt)

    print(f"问题:{current_question}")
    print(f"回答:{response}")


# [section heading] 七、一图总结
┌─────────────────────────────────────────────────────────────┐
│ 对话历史管理速查表 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 📊 三大策略 │
│ ├─ 完整历史:适合短对话(<10轮) │
│ ├─ 摘要历史:适合长对话(定期压缩) │
│ └─ 固定窗口:适合简单场景(只留最近) │
│ │
│ ⚖️ 重要性加权 │
│ ├─ 用户消息 > AI消息 │
│ ├─ 有关键词(记住、重要)→ 更重要 │
│ └─ 包含代码/数据 → 更重要 │
│ │
│ 🧠 遗忘机制 │
│ ├─ 时间遗忘:越久远越容易忘 │
│ ├─ 重要性遗忘:低重要性的优先丢弃 │
│ └─ 提取关键点:被遗忘的内容提取摘要 │
│ │
│ 🏗️ 记忆分层 │
│ ├─ 长期记忆:用户偏好、重要事实 │
│ ├─ 短期记忆:当前对话 │
│ └─ 工作记忆:当前任务 │
│ │
│ 💡 最佳实践 │
│ ├─ 根据对话长度选择策略 │
│ ├─ 优先保留用户偏好和关键决定 │
│ └─ 定期清理,避免成本失控 │
│ │
└─────────────────────────────────────────────────────────────┘
相关主题
- 上下文窗口深度解析 - 窗口限制是根本原因
- 上下文压缩技术 - 摘要技术的深入讲解
- RAG上下文优化指南 - 对话也是一种检索场景
参考文献
- Miller, A. (2024). Retrieval-Augmented Generation for Conversational AI.
- Xu, S., et al. (2024). MemoRAG: Moving towards Next-Gen RAG via Memory-Augmented Generation.
- Lewis, P., et al. (2020). Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. NeurIPS.