1. Interview Question #
Explain in detail the workflow of RAG (Retrieval-Augmented Generation) and its core purpose. Covering its two main stages (indexing; retrieval and generation), describe the key steps in each stage, the technical components used, and how RAG effectively addresses the knowledge-staleness and "hallucination" problems of large language models (LLMs).
2. Reference Answer #
2.1 The Core Purpose of RAG #
The core purpose of RAG (Retrieval-Augmented Generation) is to transform raw documents into a format that an AI system can retrieve and understand, and to improve the answer quality of large language models (LLMs) by injecting external knowledge. It addresses two problems that arise when an LLM relies solely on its pre-training knowledge: stale knowledge and "hallucinations" (generating inaccurate or fabricated information).
2.2 Overview of the RAG Workflow #
The RAG workflow consists of two main stages: Indexing and Retrieval and Generation. Together they form a complete loop that lets the LLM ground its answers in up-to-date, external, verified information.
2.3 Stage One: Indexing #
This stage converts raw documents into a retrievable format, analogous to an ETL (Extract, Transform, Load) pipeline in data processing.
2.3.1 Document Collection and Chunking #
2.3.1.1 Collection (Extract): Gather raw documents from various data sources (PDFs, Markdown files, web pages, databases, etc.).
2.3.1.2 Preprocessing and Chunking (Transform):
- Clean the collected raw documents, removing irrelevant content such as HTML tags and ads (a minimal cleaning sketch follows this list)
- Normalize document formats
- Split long documents into smaller, more manageable text chunks
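The cleaning step is not implemented anywhere in the chunking code below, so here is a minimal sketch. The `clean_document` helper and its regular expressions are illustrative assumptions, not part of any particular library:

import html
import re

def clean_document(raw_text):
    """Minimal cleaning sketch (assumed helper): strip HTML, decode
    entities, and normalize whitespace before chunking."""
    text = re.sub(r"<script.*?</script>", " ", raw_text, flags=re.S | re.I)
    text = re.sub(r"<style.*?</style>", " ", text, flags=re.S | re.I)
    text = re.sub(r"<[^>]+>", " ", text)      # drop remaining HTML tags
    text = html.unescape(text)                # decode entities such as &amp;
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
    return text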
2.3.1.3 Chunking strategies:
import re

class DocumentChunker:
    def __init__(self, chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_document(self, document):
        """Apply several chunking strategies and combine the results."""
        # 1. Fixed-size chunking
        fixed_chunks = self.fixed_size_chunking(document)
        # 2. Chunking along semantic boundaries
        semantic_chunks = self.semantic_boundary_chunking(document)
        # 3. Recursive splitting
        recursive_chunks = self.recursive_chunking(document)
        return self.optimize_chunks(fixed_chunks, semantic_chunks, recursive_chunks)

    def fixed_size_chunking(self, text):
        """Fixed-size chunking with a sliding-window overlap."""
        chunks = []
        for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
            chunk = text[i:i + self.chunk_size]
            chunks.append(chunk)
        return chunks

    def semantic_boundary_chunking(self, text):
        """Chunking along semantic (sentence) boundaries."""
        # Split into sentences
        sentences = self.split_sentences(text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size:
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def split_sentences(self, text):
        """Simple placeholder sentence splitter (regex on end punctuation)."""
        return re.split(r"(?<=[.!?])\s+", text)

    def recursive_chunking(self, text):
        """Recursive splitting: try coarse separators first, then finer ones."""
        separators = ["\n\n", "\n", ". ", "! ", "? ", " "]
        for separator in separators:
            if separator in text:
                splits = text.split(separator)
                if len(splits) > 1:
                    chunks = []
                    for split in splits:
                        if len(split) > self.chunk_size:
                            chunks.extend(self.recursive_chunking(split))
                        else:
                            chunks.append(split)
                    return chunks
        return [text]

    def optimize_chunks(self, fixed_chunks, semantic_chunks, recursive_chunks):
        """Simple placeholder: prefer semantic chunks, falling back to
        recursive and then fixed-size chunks."""
        return semantic_chunks or recursive_chunks or fixed_chunks

2.3.2 Vector Transformation and Storage #
2.3.2.1 Vector Transformation: Use a dedicated embedding model (e.g., BERT, Word2Vec, or more recent Sentence Transformers) to convert each text chunk into a high-dimensional numeric vector.
class VectorTransformer:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)

    def transform_to_vectors(self, text_chunks):
        """Convert text chunks into embedding vectors."""
        vectors = self.model.encode(text_chunks)
        return vectors

    def create_embeddings(self, documents):
        """Create embeddings and parallel metadata records for documents."""
        embeddings = []
        metadata = []
        for doc in documents:
            # Generate the embedding vector
            vector = self.model.encode(doc['content'])
            embeddings.append(vector)
            # Keep the metadata alongside the vector
            metadata.append({
                'content': doc['content'],
                'source': doc.get('source', ''),
                'timestamp': doc.get('timestamp', ''),
                'author': doc.get('author', ''),
                'chunk_id': doc.get('chunk_id', '')
            })
        return embeddings, metadata

2.3.2.2 Vector Storage (Load): Store the generated vectors, the corresponding original text chunks, and their metadata in a vector database.
import numpy as np

class VectorStore:
    def __init__(self, vector_db_type="faiss"):
        if vector_db_type == "faiss":
            import faiss
            self.index = faiss.IndexFlatIP(384)  # all-MiniLM-L6-v2 vectors are 384-dimensional
            self.metadata = []
        elif vector_db_type == "pinecone":
            import pinecone
            pinecone.init(api_key="your-api-key")
            self.index = pinecone.Index("your-index-name")

    def store_vectors(self, vectors, metadata):
        """Store vectors and their metadata."""
        if hasattr(self.index, 'add'):
            # FAISS expects a float32 matrix
            self.index.add(np.asarray(vectors, dtype='float32'))
            self.metadata = metadata
        else:
            # Pinecone
            for i, (vector, meta) in enumerate(zip(vectors, metadata)):
                self.index.upsert([(f"chunk_{i}", vector, meta)])

    def search_similar(self, query_vector, top_k=5):
        """Search for the most similar vectors."""
        if hasattr(self.index, 'search'):
            # FAISS
            query = np.asarray(query_vector, dtype='float32').reshape(1, -1)
            scores, indices = self.index.search(query, top_k)
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1:
                    continue  # fewer than top_k vectors are indexed
                results.append({
                    'score': float(score),
                    'metadata': self.metadata[idx]
                })
            return results
        else:
            # Pinecone
            return self.index.query(query_vector, top_k=top_k)

2.4 Stage Two: Retrieval and Generation #
This stage retrieves information relevant to the user's question and uses it to augment the LLM's answer.
2.4.1 Document Filtering and Retrieval #
2.4.1.1 Query Processing: Convert the user query into vector form.
class QueryProcessor:
    def __init__(self, vector_transformer):
        self.vector_transformer = vector_transformer

    def process_query(self, query):
        """Process the user query."""
        # Preprocess the query text
        processed_query = self.preprocess_query(query)
        # Convert it into a vector
        query_vector = self.vector_transformer.transform_to_vectors([processed_query])[0]
        return {
            'original_query': query,
            'processed_query': processed_query,
            'query_vector': query_vector
        }

    def preprocess_query(self, query):
        """Query preprocessing."""
        # Strip surrounding whitespace
        query = query.strip()
        # Optionally expand the query
        expanded_query = self.expand_query(query)
        return expanded_query

    def expand_query(self, query):
        """Query expansion: append synonyms and related terms."""
        synonyms = self.get_synonyms(query)
        expanded = f"{query} {' '.join(synonyms)}"
        return expanded.strip()

    def get_synonyms(self, query):
        """Simple placeholder: a real system might use a thesaurus,
        WordNet, or an LLM to generate related terms."""
        return []

2.4.1.2 Similarity Search: Run a similarity search in the vector database.
class DocumentRetriever:
    def __init__(self, vector_store, vector_transformer):
        self.vector_store = vector_store
        self.vector_transformer = vector_transformer

    def retrieve_documents(self, query, top_k=5, filters=None):
        """Retrieve documents relevant to the query."""
        # 1. Process the query
        query_processor = QueryProcessor(self.vector_transformer)
        processed_query = query_processor.process_query(query)
        # 2. Similarity search
        search_results = self.vector_store.search_similar(
            processed_query['query_vector'],
            top_k=top_k
        )
        # 3. Apply metadata filters
        if filters:
            search_results = self.apply_filters(search_results, filters)
        # 4. Rerank
        reranked_results = self.rerank_results(search_results, query)
        return reranked_results

    def apply_filters(self, results, filters):
        """Apply metadata filter conditions."""
        filtered_results = []
        for result in results:
            metadata = result['metadata']
            if self.matches_filters(metadata, filters):
                filtered_results.append(result)
        return filtered_results

    def matches_filters(self, metadata, filters):
        """Check whether metadata matches every filter condition."""
        for key, value in filters.items():
            if key not in metadata:
                return False
            if metadata[key] != value:
                return False
        return True
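    # A hedged sketch of the "more sophisticated reranking model" that
    # rerank_results mentions below: score (query, passage) pairs with a
    # cross-encoder. This method is not in the original, and the model
    # checkpoint name is an assumption; any sentence-transformers
    # cross-encoder would work the same way.
    def rerank_with_cross_encoder(self, results, query):
        from sentence_transformers import CrossEncoder
        model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        pairs = [(query, r['metadata']['content']) for r in results]
        scores = model.predict(pairs)  # one relevance score per pair
        for r, s in zip(results, scores):
            r['score'] = float(s)
        return sorted(results, key=lambda x: x['score'], reverse=True)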
    def rerank_results(self, results, query):
        """Rerank results. A more sophisticated reranking model could be
        plugged in here; this default simply sorts by similarity score."""
        return sorted(results, key=lambda x: x['score'], reverse=True)

2.4.2 Query Augmentation and Association #
2.4.2.1 Context Assembly: Combine the retrieved documents with the user query into an augmented prompt.
class ContextAssembler:
    def __init__(self):
        self.max_context_length = 4000  # context budget in characters (a token count is more precise)

    def assemble_context(self, query, retrieved_docs):
        """Assemble the augmented prompt."""
        # 1. Extract document contents
        doc_contents = [doc['metadata']['content'] for doc in retrieved_docs]
        # 2. Build the context string
        context = self.build_context(doc_contents)
        # 3. Assemble the augmented prompt
        augmented_prompt = self.create_augmented_prompt(query, context)
        return augmented_prompt

    def build_context(self, doc_contents):
        """Concatenate documents until the length budget is exhausted."""
        context_parts = []
        current_length = 0
        for i, content in enumerate(doc_contents):
            if current_length + len(content) <= self.max_context_length:
                context_parts.append(f"Document {i+1}: {content}")
                current_length += len(content)
            else:
                break
        return "\n\n".join(context_parts)

    def create_augmented_prompt(self, query, context):
        """Create the augmented prompt."""
        prompt = f"""
Answer the question based on the following context.

Context:
{context}

Question: {query}

Answer using the context above; if the context does not contain the
relevant information, say so. Cite the specific source documents in
your answer.
"""
        return prompt

2.4.2.2 Content Generation: Use the LLM to generate the final answer.
class RAGGenerator:
    def __init__(self, llm_client):
        self.llm_client = llm_client

    def generate_response(self, augmented_prompt):
        """Generate an answer from the augmented prompt."""
        response = self.llm_client.generate(augmented_prompt)
        return response

    def generate_with_citations(self, query, retrieved_docs, augmented_prompt):
        """Generate an answer together with source citations."""
        # Generate the answer
        response = self.generate_response(augmented_prompt)
        # Attach citations
        citations = self.extract_citations(response, retrieved_docs)
        return {
            'response': response,
            'citations': citations,
            'sources': [doc['metadata']['source'] for doc in retrieved_docs]
        }

    def extract_citations(self, response, retrieved_docs):
        """Extract citations via a crude keyword-overlap heuristic."""
        citations = []
        for i, doc in enumerate(retrieved_docs):
            if any(keyword in response.lower() for keyword in doc['metadata']['content'].lower().split()[:10]):
                citations.append({
                    'doc_id': i,
                    'source': doc['metadata']['source'],
                    'relevance_score': doc['score']
                })
        return citations

2.5 Complete RAG System Implementation #
class RAGSystem:
    def __init__(self, llm_client=None, vector_db_type="faiss",
                 embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        # Wire up the components; llm_client must expose a generate() method
        self.vector_transformer = VectorTransformer(embedding_model)
        self.vector_store = VectorStore(vector_db_type)
        self.document_retriever = DocumentRetriever(self.vector_store, self.vector_transformer)
        self.context_assembler = ContextAssembler()
        self.rag_generator = RAGGenerator(llm_client)

    def index_documents(self, documents):
        """Build the document index."""
        # 1. Chunk the documents
        chunker = DocumentChunker()
        chunks = []
        for doc in documents:
            doc_chunks = chunker.chunk_document(doc['content'])
            for i, chunk in enumerate(doc_chunks):
                chunks.append({
                    'content': chunk,
                    'source': doc.get('source', ''),
                    'timestamp': doc.get('timestamp', ''),
                    'author': doc.get('author', ''),
                    'chunk_id': f"{doc.get('id', '')}_{i}"
                })
        # 2. Convert the chunks to vectors
        embeddings, metadata = self.vector_transformer.create_embeddings(chunks)
        # 3. Store the vectors
        self.vector_store.store_vectors(embeddings, metadata)
        return len(chunks)

    def query(self, question, top_k=5, filters=None):
        """Answer a question against the indexed documents."""
        # 1. Retrieve relevant documents
        retrieved_docs = self.document_retriever.retrieve_documents(
            question, top_k=top_k, filters=filters
        )
        # 2. Assemble the context
        augmented_prompt = self.context_assembler.assemble_context(
            question, retrieved_docs
        )
        # 3. Generate the answer
        result = self.rag_generator.generate_with_citations(
            question, retrieved_docs, augmented_prompt
        )
        return result

2.6 How RAG Solves the LLM's Problems #
2.6.1 Addressing Knowledge Staleness #
class KnowledgeUpdater:
    def __init__(self, rag_system):
        self.rag_system = rag_system

    def update_knowledge_base(self, new_documents, cutoff_date=None):
        """Refresh the knowledge base."""
        # Add the new documents to the index
        self.rag_system.index_documents(new_documents)
        # Optionally drop outdated documents
        if cutoff_date:
            self.remove_outdated_documents(cutoff_date)

    def remove_outdated_documents(self, cutoff_date):
        """Remove documents older than cutoff_date by timestamp.
        Left as a stub: a flat FAISS index cannot delete vectors in
        place, so a real implementation would rebuild the index from
        the remaining documents."""
        pass

2.6.2 Mitigating the "Hallucination" Problem #
class FactChecker:
    """Simple placeholder for an external fact-checking component."""
    def check_facts(self, response, retrieved_docs):
        # A real implementation might use an NLI model or an LLM judge;
        # return a neutral score here
        return 0.5

class HallucinationDetector:
    def __init__(self):
        self.fact_checker = FactChecker()

    def detect_hallucination(self, response, retrieved_docs):
        """Detect potential hallucinations in the generated answer."""
        # 1. Check whether the answer is grounded in the retrieved documents
        grounded_score = self.check_grounding(response, retrieved_docs)
        # 2. Check factual consistency
        fact_score = self.fact_checker.check_facts(response, retrieved_docs)
        # 3. Combine into an overall score
        hallucination_score = 1 - (grounded_score * 0.7 + fact_score * 0.3)
        return {
            'hallucination_score': hallucination_score,
            'is_hallucinated': hallucination_score > 0.5,
            'grounded_score': grounded_score,
            'fact_score': fact_score
        }
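    # A hedged alternative to the keyword-overlap check below: compare
    # the answer with each retrieved chunk via embedding cosine
    # similarity. This method is not in the original, and the model
    # choice is an assumption.
    def check_grounding_embedding(self, response, retrieved_docs):
        from sentence_transformers import SentenceTransformer, util
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        doc_texts = [doc['metadata']['content'] for doc in retrieved_docs]
        response_vec = model.encode(response, convert_to_tensor=True)
        doc_vecs = model.encode(doc_texts, convert_to_tensor=True)
        # Grounding score = highest similarity to any retrieved chunk
        return float(util.cos_sim(response_vec, doc_vecs).max())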
    def check_grounding(self, response, retrieved_docs):
        """Check grounding via a simple keyword-overlap heuristic."""
        doc_keywords = set()
        for doc in retrieved_docs:
            doc_keywords.update(doc['metadata']['content'].lower().split())
        response_keywords = set(response.lower().split())
        overlap = len(doc_keywords.intersection(response_keywords))
        return overlap / len(response_keywords) if response_keywords else 0

2.7 Practical Usage Example #
# Usage example
def main():
    # Initialize the RAG system (pass a real LLM client in practice)
    rag = RAGSystem()

    # 1. Build the index
    documents = [
        {
            'content': 'Artificial intelligence is a branch of computer science...',
            'source': 'AI_handbook.pdf',
            'timestamp': '2024-01-15',
            'author': 'Zhang San'
        },
        {
            'content': 'Machine learning is the core technology of artificial intelligence...',
            'source': 'ML_guide.pdf',
            'timestamp': '2024-01-20',
            'author': 'Li Si'
        }
    ]
    rag.index_documents(documents)

    # 2. Query
    question = "What is artificial intelligence?"
    result = rag.query(question, top_k=3)
    print(f"Question: {question}")
    print(f"Answer: {result['response']}")
    print(f"Sources: {result['sources']}")
    print(f"Citations: {result['citations']}")

if __name__ == "__main__":
    main()

2.8 Summary #
RAG effectively addresses the LLM's problems in the following ways:
- Knowledge staleness: updating the vector database in real time or on a schedule ensures the LLM can reference the latest information
- "Hallucinations": forcing the LLM to answer based on retrieved, real information reduces fabricated content
- Explainability: source citations let users verify the information behind an answer
- Accuracy: retrieval over an external knowledge base yields more accurate, more specific answers
Through its two stages, indexing and retrieval and generation, a RAG system forms a complete knowledge-augmentation framework that lets LLMs serve real-world applications more effectively.