Skip to content

多模态RAG

多模态RAG是一种扩展的RAG技术,能够处理文本、图像、音频等多种模态的数据。通过整合不同模态的信息,多模态RAG可以提供更丰富、更准确的回答。本章节将详细介绍多模态RAG的概念、实现方法和应用场景。

1. 多模态RAG基础

核心概念

  • 多模态数据:包含文本、图像、音频、视频等多种类型的数据
  • 多模态嵌入:将不同模态的数据映射到同一向量空间
  • 跨模态检索:在不同模态之间进行检索,如用文本检索图像
  • 多模态融合:将不同模态的信息融合在一起

工作流程

  1. 多模态数据处理:处理和预处理不同模态的数据
  2. 多模态嵌入:将不同模态的数据转换为向量
  3. 跨模态检索:根据查询检索相关的多模态数据
  4. 多模态融合:融合不同模态的信息
  5. 生成回答:基于融合的信息生成回答

2. 多模态嵌入模型

主流模型

  • CLIP (Contrastive Language-Image Pre-training):OpenAI开发,能将文本和图像映射到同一向量空间
  • ALIGN:Google开发,通过大规模噪声对比学习实现跨模态对齐
  • Florence:微软开发,支持更细粒度的视觉-语言理解
  • ViLT (Vision-and-Language Transformer):端到端的视觉-语言预训练模型

CLIP模型示例

python
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load the pretrained CLIP checkpoint together with its matching processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fetch a sample image and define the candidate captions to score against it.
image_url = "https://example.com/cat.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
texts = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]

# Preprocess the captions and image together, then run one forward pass
# to obtain embeddings in CLIP's shared vision-language space.
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Per-modality embeddings projected into the shared space.
image_embeds = outputs.image_embeds
text_embeds = outputs.text_embeds

# Image-to-text similarity logits, softmaxed into caption probabilities.
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

3. 多模态向量存储

python
import numpy as np
import faiss

class MultimodalVectorStore:
    """In-memory vector store with separate FAISS indexes for text and images.

    Both indexes use inner-product (IP) similarity, so embeddings should be
    L2-normalized by the caller if cosine similarity is intended.
    Metadata lists are kept parallel to the index rows.
    """

    def __init__(self, dimension):
        self.dimension = dimension
        self.text_index = faiss.IndexFlatIP(dimension)
        self.image_index = faiss.IndexFlatIP(dimension)
        self.text_metadata = []   # row i describes text_index vector i
        self.image_metadata = []  # row i describes image_index vector i

    def add_texts(self, texts, embeddings, metadata=None):
        """Add text entries with precomputed embeddings.

        Args:
            texts: list of text strings.
            embeddings: list/array of vectors, one per text.
            metadata: optional list of dicts, one per text.
        """
        embeddings = np.asarray(embeddings, dtype='float32')
        self.text_index.add(embeddings)
        for i, text in enumerate(texts):
            # Copy so we never mutate the caller's metadata dicts.
            meta = dict(metadata[i]) if metadata else {}
            meta['text'] = text
            self.text_metadata.append(meta)

    def add_images(self, image_paths, embeddings, metadata=None):
        """Add image entries with precomputed embeddings (same contract as add_texts)."""
        embeddings = np.asarray(embeddings, dtype='float32')
        self.image_index.add(embeddings)
        for i, path in enumerate(image_paths):
            # Copy so we never mutate the caller's metadata dicts.
            meta = dict(metadata[i]) if metadata else {}
            meta['path'] = path
            self.image_metadata.append(meta)

    def add_batch(self, documents, embeddings):
        """Add a mixed batch of documents with precomputed embeddings.

        Each document is a dict with 'type' ('text' or 'image'), 'content'
        or 'path', and an optional 'metadata' dict.
        """
        for doc, emb in zip(documents, embeddings):
            meta = [doc.get('metadata', {})]
            if doc['type'] == 'text':
                self.add_texts([doc['content']], [emb], meta)
            elif doc['type'] == 'image':
                self.add_images([doc['path']], [emb], meta)

    def search_text_by_text(self, query_embedding, k=5):
        """Return metadata of the k text entries most similar to the query embedding."""
        query = np.asarray([query_embedding], dtype='float32')
        distances, indices = self.text_index.search(query, k)
        # FAISS pads the result with -1 when fewer than k vectors are indexed;
        # filter those out instead of wrapping around to metadata[-1].
        return [self.text_metadata[i] for i in indices[0] if i >= 0]

    def search_image_by_text(self, query_embedding, k=5):
        """Return metadata of the k image entries most similar to a (text) query embedding."""
        query = np.asarray([query_embedding], dtype='float32')
        distances, indices = self.image_index.search(query, k)
        # Same -1 padding caveat as search_text_by_text.
        return [self.image_metadata[i] for i in indices[0] if i >= 0]

    def search_image_by_image(self, query_embedding, k=5):
        """Image-to-image search; identical to text-to-image because CLIP
        places both modalities in the same embedding space."""
        return self.search_image_by_text(query_embedding, k)

4. 多模态RAG实现

python
class MultimodalRAG:
    """End-to-end multimodal RAG pipeline: CLIP for text/image embeddings,
    a dual-index vector store for retrieval, and an LLM for answer generation."""

    def __init__(self):
        # Load CLIP; its text and image features live in one shared space.
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Vector store sized to CLIP ViT-B/32's 512-d projection output.
        self.vector_store = MultimodalVectorStore(dimension=512)
        
        # LLM used for final answer generation.
        # NOTE(review): `OpenAI` is not imported in this snippet — confirm its origin.
        self.llm = OpenAI()
    
    def embed_text(self, text):
        """Return the CLIP text embedding for `text` as a 1-D numpy array."""
        inputs = self.clip_processor(text=text, return_tensors="pt", padding=True)
        outputs = self.clip_model.get_text_features(**inputs)
        return outputs.detach().numpy()[0]
    
    def embed_image(self, image_path):
        """Return the CLIP image embedding for the image at `image_path` as a 1-D numpy array."""
        image = Image.open(image_path)
        inputs = self.clip_processor(images=image, return_tensors="pt")
        outputs = self.clip_model.get_image_features(**inputs)
        return outputs.detach().numpy()[0]
    
    def index_documents(self, documents):
        """Index multimodal documents.

        Each document is a dict with 'type' ('text' or 'image'), a 'content'
        (text) or 'path' (image) field, and an optional 'metadata' dict.
        Unknown types are silently skipped.
        """
        for doc in documents:
            if doc['type'] == 'text':
                embedding = self.embed_text(doc['content'])
                self.vector_store.add_texts(
                    [doc['content']], 
                    [embedding], 
                    [doc.get('metadata', {})]
                )
            elif doc['type'] == 'image':
                embedding = self.embed_image(doc['path'])
                self.vector_store.add_images(
                    [doc['path']], 
                    [embedding], 
                    [doc.get('metadata', {})]
                )
    
    def query(self, question, retrieve_images=True):
        """Run a multimodal RAG query.

        Embeds the question once with CLIP, retrieves top-3 texts (and
        top-2 images when `retrieve_images` is True), builds a combined
        context string, and asks the LLM for an answer.

        Returns a dict with keys 'answer', 'text_sources', 'image_sources'.
        """
        # Embed the query text (CLIP text encoder).
        query_embedding = self.embed_text(question)
        
        # Retrieve related text passages.
        text_results = self.vector_store.search_text_by_text(query_embedding, k=3)
        
        # Retrieve related images (cross-modal: text query against image index).
        image_results = []
        if retrieve_images:
            image_results = self.vector_store.search_image_by_text(query_embedding, k=2)
        
        # Build the combined multimodal context.
        context = self.build_multimodal_context(text_results, image_results)
        
        # Generate the final answer with the LLM.
        answer = self.generate_answer(question, context)
        
        return {
            "answer": answer,
            "text_sources": text_results,
            "image_sources": image_results
        }
    
    def build_multimodal_context(self, text_results, image_results):
        """Format retrieved texts and image descriptions into one context string.

        Texts are truncated to 300 characters; an ellipsis is appended
        unconditionally, even for shorter texts.
        """
        context_parts = []
        
        # Text context section.
        if text_results:
            context_parts.append("相关文本信息:")
            for i, result in enumerate(text_results):
                context_parts.append(f"{i+1}. {result['text'][:300]}...")
        
        # Image context section: images enter the prompt via text descriptions.
        if image_results:
            context_parts.append("\n相关图像:")
            for i, result in enumerate(image_results):
                # Generated by an image-captioning model (simplified below).
                description = self.describe_image(result['path'])
                context_parts.append(f"{i+1}. 图像描述: {description}")
        
        return "\n".join(context_parts)
    
    def describe_image(self, image_path):
        """Return a text description of the image.

        Placeholder: a captioning model such as BLIP could be used here;
        this simplified version just echoes the file path.
        """
        return f"图像: {image_path}"
    
    def generate_answer(self, question, context):
        """Ask the LLM to answer `question` grounded in `context`."""
        prompt = f"""基于以下多模态信息回答问题:

{context}

问题:{question}

请提供详细且准确的回答:"""
        
        return self.llm.generate(prompt)

5. 应用场景

电商产品搜索

python
class EcommerceMultimodalRAG(MultimodalRAG):
    """Multimodal RAG specialized for e-commerce product search.

    Inherits the base pipeline unchanged; the redundant `__init__` that
    only called `super().__init__()` has been removed (the inherited one
    is used automatically).
    """

    def search_products(self, query):
        """Run a multimodal query and reshape the results as product records.

        Returns a dict with:
            products: parsed product info for each retrieved text hit.
            related_images: image hits from the base pipeline.
        """
        results = self.query(query, retrieve_images=True)

        # NOTE(review): `parse_product_info` is not defined in this snippet —
        # confirm it is supplied by a subclass or mixin.
        products = [self.parse_product_info(text) for text in results['text_sources']]

        return {
            "products": products,
            "related_images": results['image_sources']
        }

医学影像分析

python
class MedicalMultimodalRAG(MultimodalRAG):
    """Multimodal RAG specialized for medical-case retrieval and analysis."""

    def __init__(self):
        super().__init__()
        # Load domain-specific medical models here (placeholder).
    
    def analyze_medical_case(self, description, image_path=None):
        """Retrieve similar cases (text, and optionally imaging) and synthesize an analysis.

        Args:
            description: free-text case description.
            image_path: optional path to a medical image for similarity search.
        """
        # Embed the textual description for retrieval.
        query_embedding = self.embed_text(description)
        
        # Retrieve similar textual cases.
        text_cases = self.vector_store.search_text_by_text(query_embedding, k=5)
        
        # If imaging is supplied, retrieve visually similar studies.
        # NOTE(review): `search_image_by_image` is not defined on the
        # MultimodalVectorStore shown earlier in this document — confirm
        # the deployed store provides it.
        image_cases = []
        if image_path:
            image_embedding = self.embed_image(image_path)
            image_cases = self.vector_store.search_image_by_image(image_embedding, k=3)
        
        # Synthesize the final analysis from both retrieval results.
        # NOTE(review): `generate_medical_analysis` is not defined in this snippet.
        analysis = self.generate_medical_analysis(
            description, text_cases, image_cases
        )
        
        return analysis

6. 性能优化

异步处理

python
import asyncio

class AsyncMultimodalRAG(MultimodalRAG):
    """Async wrapper that offloads blocking embedding calls to an executor.

    Fixes the original, which called `self.embed_text_async` without ever
    defining it (AttributeError for any text document).
    """

    async def embed_text_async(self, text):
        """Embed text without blocking the event loop."""
        # get_running_loop is the correct call inside a coroutine
        # (get_event_loop is deprecated there since Python 3.10).
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.embed_text, text)

    async def embed_image_async(self, image_path):
        """Embed an image without blocking the event loop."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.embed_image, image_path)

    async def index_documents_async(self, documents):
        """Embed all documents concurrently, then add them to the index in one batch."""
        tasks = []
        for doc in documents:
            if doc['type'] == 'text':
                tasks.append(self.embed_text_async(doc['content']))
            elif doc['type'] == 'image':
                tasks.append(self.embed_image_async(doc['path']))

        embeddings = await asyncio.gather(*tasks)
        # NOTE(review): requires an `add_batch` method on the vector store —
        # confirm the deployed MultimodalVectorStore provides it.
        self.vector_store.add_batch(documents, embeddings)

缓存优化

python
from functools import lru_cache

class CachedMultimodalRAG(MultimodalRAG):
    """RAG variant that memoizes embeddings per instance.

    Fixes two defects in the original:
      * `functools.lru_cache` on a bound method keys on `self` and keeps
        every instance alive for the cache's lifetime (ruff B019); a plain
        per-instance dict is used instead.
      * `self.cache` was read in `embed_image` but never initialized,
        raising AttributeError on first use.
    """

    def __init__(self):
        super().__init__()
        self._text_cache = {}  # text -> embedding
        self.cache = {}        # "image_<md5>" key -> embedding

    def embed_text(self, text):
        """Return the cached text embedding, computing it on first use."""
        if text not in self._text_cache:
            self._text_cache[text] = super().embed_text(text)
        return self._text_cache[text]

    def embed_image(self, image_path):
        """Return the cached image embedding, keyed by the file's content hash."""
        import hashlib
        # Content hash, so renamed/moved copies of the same file share one entry.
        with open(image_path, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()

        cache_key = f"image_{file_hash}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        embedding = super().embed_image(image_path)
        self.cache[cache_key] = embedding
        return embedding

7. 挑战与解决方案

模态对齐

不同模态的数据需要在同一向量空间中对齐:

python
def align_modalities(text_embeds, image_embeds, temperature=0.07):
    """Align text and image embeddings in a shared space.

    Projects both embedding sets onto the unit sphere, then returns the
    temperature-scaled cosine-similarity matrix (text rows x image columns).
    """
    unit_text = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
    unit_image = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    # Dot products of unit vectors are cosine similarities; dividing by the
    # temperature sharpens the distribution, as in CLIP's contrastive loss.
    return torch.matmul(unit_text, unit_image.T) / temperature

多模态融合策略

python
def fuse_multimodal_features(text_features, image_features, fusion_type='concat'):
    """Fuse text and image feature tensors.

    Args:
        text_features: text feature tensor.
        image_features: image feature tensor; must match text_features'
            trailing dimension for the 'attention' and 'gate' strategies.
        fusion_type: one of 'concat', 'attention', 'gate'.

    Returns:
        The fused feature tensor.

    Raises:
        ValueError: if fusion_type is not a supported strategy (the
            original silently returned None in that case).
    """
    if fusion_type == 'concat':
        # Simple concatenation along the feature axis.
        return torch.cat([text_features, image_features], dim=-1)
    elif fusion_type == 'attention':
        # Attention-weighted convex combination of the two modalities.
        # NOTE(review): `compute_attention` is not defined in this snippet — confirm.
        attention_weights = compute_attention(text_features, image_features)
        return attention_weights * text_features + (1 - attention_weights) * image_features
    elif fusion_type == 'gate':
        # Sigmoid gate derived from cross-modal similarity.
        gate = torch.sigmoid(torch.matmul(text_features, image_features.T))
        return gate * text_features + (1 - gate) * image_features
    raise ValueError(f"Unknown fusion_type: {fusion_type!r}")