Understand the concepts, principles, and applications of text embeddings, and master the core techniques for converting text into vector representations.
# Semantic Arithmetic with Word Vectors

Word embeddings place words in a vector space where semantic relationships show up as directions, which makes analogy-style arithmetic possible:

```
embedding("国王") - embedding("男人") + embedding("女人") ≈ embedding("女王")   # king - man + woman ≈ queen
embedding("北京") - embedding("中国") + embedding("日本") ≈ embedding("东京")   # Beijing - China + Japan ≈ Tokyo
```
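The same arithmetic can be reproduced with any pre-trained word-vector model. The sketch below is illustrative only: the file path `chinese_word2vec.txt` is a hypothetical placeholder for a Chinese word2vec file in text format, loaded through gensim's `KeyedVectors`.

```python
from gensim.models import KeyedVectors

# Hypothetical path: substitute any pre-trained Chinese word2vec file (text format)
model = KeyedVectors.load_word2vec_format("chinese_word2vec.txt", binary=False)

# king - man + woman ≈ ?  most_similar handles the vector arithmetic internally
results = model.most_similar(positive=["国王", "女人"], negative=["男人"], topn=3)
for word, score in results:
    print(f"{word}: {score:.4f}")  # "女王" (queen) is expected near the top
```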
```python
import numpy as np

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

# Example
sim_score = cosine_similarity(
    embedding("人工智能"),
    embedding("机器学习")
)
# A score close to 0.85 indicates the two texts are highly related
```
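The function above implements the standard cosine-similarity formula:

$$
\cos(\theta) = \frac{\mathbf{a} \cdot \mathbf{b}}{\lVert \mathbf{a} \rVert \, \lVert \mathbf{b} \rVert}
= \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^{2}}\,\sqrt{\sum_{i=1}^{n} b_i^{2}}}
$$

Values range from -1 to 1; for typical text embeddings, scores near 1 indicate strong semantic similarity.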
```python
from openai import OpenAI

client = OpenAI()

def get_embedding(text, model="text-embedding-3-small", dimensions=None):
    """Get the embedding vector for a piece of text."""
    params = {
        "model": model,
        "input": text
    }
    # text-embedding-3 models allow the output dimensionality to be reduced
    if dimensions:
        params["dimensions"] = dimensions
    response = client.embeddings.create(**params)
    return response.data[0].embedding

# Usage examples
embedding_full = get_embedding("人工智能的未来")                    # 1536 dimensions by default
embedding_small = get_embedding("人工智能的未来", dimensions=512)  # reduced to 512 dimensions
```
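The `dimensions` parameter works because the text-embedding-3 models are trained so that a prefix of the full vector remains meaningful on its own; the shortened vector just needs to be renormalized. The sketch below shows the roughly equivalent client-side operation, which can be handy when only full-size embeddings have been stored:

```python
import numpy as np

def truncate_embedding(embedding, dim):
    """Keep the first `dim` components and renormalize to unit length."""
    truncated = np.array(embedding[:dim])
    return truncated / np.linalg.norm(truncated)

# Roughly equivalent to requesting dimensions=512 from the API
embedding_512 = truncate_embedding(embedding_full, 512)
```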
```python
from sentence_transformers import SentenceTransformer

# Load the BGE-M3 model
model = SentenceTransformer('BAAI/bge-m3')

# Encode a batch of texts
texts = [
    "人工智能正在改变世界",
    "机器学习是AI的核心技术",
    "今天天气真好"
]
embeddings = model.encode(texts, normalize_embeddings=True)
print(f"Embedding shape: {embeddings.shape}")  # (3, 1024)

# Because the embeddings are normalized, the dot product equals cosine similarity
similarities = embeddings @ embeddings.T
print(f"Similarity between texts 1 and 2: {similarities[0][1]:.4f}")  # roughly 0.85
print(f"Similarity between texts 1 and 3: {similarities[0][2]:.4f}")  # roughly 0.20
```
```python
class SemanticSearch:
    def __init__(self, embedding_model="text-embedding-3-small"):
        self.model = embedding_model
        self.document_embeddings = []
        self.documents = []

    def index_documents(self, documents):
        """Build an embedding index for the documents."""
        for doc in documents:
            embedding = get_embedding(doc, self.model)
            self.document_embeddings.append(embedding)
            self.documents.append(doc)

    def search(self, query, top_k=5):
        """Semantic search: return the documents most similar to the query."""
        query_embedding = get_embedding(query, self.model)

        # Compute similarity against every indexed document
        similarities = []
        for idx, doc_embedding in enumerate(self.document_embeddings):
            sim = cosine_similarity(query_embedding, doc_embedding)
            similarities.append((idx, sim))

        # Return the most similar documents
        similarities.sort(key=lambda x: x[1], reverse=True)
        return [(self.documents[idx], score) for idx, score in similarities[:top_k]]
```
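A minimal usage sketch of the class above; the documents and query are illustrative placeholders:

```python
docs = [
    "深度学习需要大量的训练数据",
    "Transformer 是现代 NLP 的基础架构",
    "周末适合去郊外爬山",
]

searcher = SemanticSearch()
searcher.index_documents(docs)

for doc, score in searcher.search("神经网络的训练", top_k=2):
    print(f"{score:.4f}  {doc}")
```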
```python
def enhance_prompt_with_context(query, knowledge_base):
    """Use embedding-based retrieval to augment the prompt."""
    # 1. Retrieve the most relevant documents from the knowledge base
    #    (SemanticSearch computes the query embedding internally)
    relevant_docs = knowledge_base.search(query, top_k=3)

    # 2. Build the augmented prompt
    context = "\n".join([doc for doc, _ in relevant_docs])
    enhanced_prompt = f"""
Answer the question based on the following relevant information:

{context}

Question: {query}
"""
    return enhanced_prompt
```
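Putting the pieces together, an end-to-end retrieval-augmented call might look like the sketch below. It reuses the `searcher` instance from the earlier example, and the chat model name is illustrative:

```python
prompt = enhance_prompt_with_context("Transformer 有什么作用?", searcher)

completion = client.chat.completions.create(
    model="gpt-4o-mini",  # illustrative model name
    messages=[{"role": "user", "content": prompt}],
)
print(completion.choices[0].message.content)
```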
```python
def optimize_embedding_dimensions(texts, test_queries):
    """Benchmark retrieval quality against storage cost across dimensionalities."""
    dimensions = [256, 512, 1024, 1536, 3072]  # 3072 requires text-embedding-3-large
    results = {}

    for dim in dimensions:
        # Generate embeddings at this dimensionality
        embeddings = [get_embedding(t, dimensions=dim) for t in texts]

        # Evaluate retrieval quality
        avg_precision = evaluate_retrieval(embeddings, test_queries)

        # Estimate storage cost
        storage_cost = calculate_storage_cost(len(texts), dim)

        results[dim] = {
            "precision": avg_precision,
            "storage_mb": storage_cost,
            "cost_efficiency": avg_precision / storage_cost
        }

    return results
```
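The two helpers are left undefined above. A possible implementation of `calculate_storage_cost`, assuming embeddings are stored as 32-bit floats, is sketched below; `evaluate_retrieval` would depend on your labeled test queries:

```python
def calculate_storage_cost(num_texts, dim, bytes_per_value=4):
    """Approximate storage in MB for num_texts float32 embeddings of size `dim`."""
    return num_texts * dim * bytes_per_value / (1024 * 1024)

# 100,000 documents at 1536 dimensions ≈ 586 MB
print(f"{calculate_storage_cost(100_000, 1536):.1f} MB")
```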
```python
def batch_encode(texts, batch_size=100):
    """Encode a large collection of texts in batches."""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # The embeddings API accepts a list of inputs in a single request
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=batch
        )

        batch_embeddings = [item.embedding for item in response.data]
        all_embeddings.extend(batch_embeddings)

    return all_embeddings
```
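A brief usage sketch, stacking the results into a NumPy matrix for downstream indexing; the corpus variable is a placeholder:

```python
import numpy as np

corpus = [f"文档 {i}" for i in range(1000)]  # placeholder corpus
vectors = np.array(batch_encode(corpus))

print(vectors.shape)  # (1000, 1536) for text-embedding-3-small
```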