# 大语言模型应用的成本控制策略和优化技术
模型 | 输入价格 | 输出价格 | 特点 |
---|---|---|---|
Gemini 1.5 Flash-8B | $0.0375/百万 | $0.15/百万 | 最便宜,适合简单任务 |
GPT-4o-mini | $0.15/百万 | $0.60/百万 | 性价比高,功能均衡 |
Claude 3 Haiku | $0.25/百万 | $1.25/百万 | 响应快,适合实时场景 |
Mistral Small | $0.40/百万 | $1.20/百万 | 开源友好,欧洲合规 |
模型 | 输入价格 | 输出价格 | 特点 |
---|---|---|---|
Claude 3.5 Sonnet | $3/百万 | $15/百万 | 编程能力强 |
GPT-4o | $10/百万 | $30/百万 | 综合能力优秀 |
Gemini 1.5 Pro | $1.25/百万 | $5/百万 | 超长上下文 |
模型 | 输入价格 | 输出价格 | 特点 |
---|---|---|---|
Claude 4 Opus | $15/百万 | $75/百万 | 推理能力最强 |
GPT-4 (128K) | $60/百万 | $120/百万 | 长文本处理 |
Gemini Ultra | 定制报价 | 定制报价 | 企业级方案 |
import hashlib
import json
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Any, Dict, Optional

import redis
class MultiLevelCache:
    """Two-tier cache (in-process dict + Redis) for LLM responses.

    L1 is a per-process dict — fast but unbounded; NOTE(review): consider an
    LRU bound for long-running processes. L2 is Redis with a 1-hour TTL.
    Fix: `json` was used below but never imported at file level.
    """

    def __init__(self):
        self.memory_cache = {}  # L1: in-process cache (no eviction)
        self.redis_client = redis.Redis()  # L2: shared Redis cache
        self.cache_stats = {"hits": 0, "misses": 0}

    def get_cache_key(self, prompt: str, params: Dict[str, Any]) -> str:
        """Build a deterministic cache key from the prompt and its params.

        Params are sorted so the dict's insertion order does not change the
        key. MD5 is used for compactness, not security.
        """
        content = f"{prompt}:{sorted(params.items())}"
        return hashlib.md5(content.encode()).hexdigest()

    async def get_or_generate(self, prompt: str, params: Dict[str, Any]):
        """Return a cached response, or generate, cache, and return a new one."""
        cache_key = self.get_cache_key(prompt, params)
        # L1: in-process memory
        if cache_key in self.memory_cache:
            self.cache_stats["hits"] += 1
            return self.memory_cache[cache_key]
        # L2: Redis; on a hit, promote the entry into L1
        cached = self.redis_client.get(cache_key)
        if cached:
            self.cache_stats["hits"] += 1
            result = json.loads(cached)
            self.memory_cache[cache_key] = result
            return result
        # Miss: generate a fresh response and write it to both tiers
        self.cache_stats["misses"] += 1
        result = await generate_llm_response(prompt, params)
        self.memory_cache[cache_key] = result
        self.redis_client.setex(
            cache_key,
            3600,  # expire after 1 hour
            json.dumps(result)
        )
        return result

    def get_hit_rate(self) -> float:
        """Return the hit ratio in [0, 1]; 0 before any lookup has happened."""
        total = self.cache_stats["hits"] + self.cache_stats["misses"]
        return self.cache_stats["hits"] / total if total > 0 else 0
def optimize_prompt(original_prompt: str) -> str:
    """Compress a prompt by stripping redundant phrasing (50%+ token savings).

    Collapses whitespace, drops polite filler phrases, and swaps common
    connectives for shorter synonyms. Rewrites are applied in declaration
    order, so overlapping phrases always resolve the same way.
    """
    # Collapse every run of whitespace (including newlines) into one space.
    compact = " ".join(original_prompt.split())
    # Polite filler phrases that carry no information.
    fillers = (
        ("请你帮我", ""),
        ("我想要你", ""),
        ("能否请你", ""),
        ("非常详细地", "详细"),
        ("尽可能地", ""),
    )
    # Connectives replaced by single-character synonyms.
    shorthand = (
        ("例如", "如"),
        ("但是", "但"),
        ("因此", "故"),
        ("所以", "故"),
    )
    for phrase, replacement in fillers + shorthand:
        compact = compact.replace(phrase, replacement)
    return compact
# Example: verbose prompt vs. a hand-optimized equivalent (1200 tokens → 600 tokens)
# The verbose form below is full of filler that optimize_prompt() would strip.
original = """
请你帮我非常详细地分析一下这个问题,
我想要你从多个角度来考虑,
并且尽可能地给出具体的例子。
"""
# Manually rewritten compact version conveying the same request.
optimized = "详细分析此问题,多角度考虑,给出具体例子。"
# LLMLingua-based compression: claimed up to 20x compression while keeping
# ~95% of the semantic content (per the LLMLingua project documentation).
from llmlingua import PromptCompressor

compressor = PromptCompressor(
    model_name="gpt2",
    device_map="cpu",
    use_llmlingua2=True
)

# Bug fix: the snippet referenced an undefined name `original_prompt`;
# the verbose example prompt defined above is bound to `original`.
compressed_prompt = compressor.compress_prompt(
    original,
    rate=0.05,  # keep only ~5% of the tokens
    force_tokens=["关键词1", "关键词2"]  # tokens that must survive compression
)
class ModelRouter:
    """Routes a request to a model tier based on task type and complexity."""

    def __init__(self):
        # Preferred model per task type for mid/high-complexity work.
        self.rules = {
            "simple_qa": "gpt-4o-mini",
            "translation": "gemini-flash-8b",
            "code_generation": "claude-3.5-sonnet",
            "complex_reasoning": "claude-4-opus",
            "creative_writing": "gpt-4",
            "data_extraction": "gpt-4o-mini",
        }

    def select_model(self, task_type: str, complexity: float) -> str:
        """Pick a model name given a task type and a complexity score in [0, 1]."""
        # Guard clause: trivially simple work always goes to the cheapest model.
        if complexity < 0.3:
            return "gemini-flash-8b"
        # The rule table wins when it knows the task; only the fallback
        # differs by tier — balanced default below 0.7, premium above.
        fallback = "gpt-4o-mini" if complexity < 0.7 else "claude-4-opus"
        return self.rules.get(task_type, fallback)

    def estimate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Estimate the request cost in USD (rates are dollars per 1K tokens)."""
        per_1k = {
            "gemini-flash-8b": {"input": 0.0000375, "output": 0.00015},
            "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
            "claude-3.5-sonnet": {"input": 0.003, "output": 0.015},
            "claude-4-opus": {"input": 0.015, "output": 0.075},
        }
        # Unknown models fall back to the balanced-tier rate.
        rates = per_1k.get(model, per_1k["gpt-4o-mini"])
        total = input_tokens * rates["input"] + output_tokens * rates["output"]
        return total / 1000
class CostMonitor:
    """Tracks LLM spend against a monthly budget and flags cost anomalies.

    Fixes vs. the original: the four helper methods it called
    (`send_budget_alert`, `send_anomaly_alert`, `get_average_request_cost`,
    `project_monthly_cost`) did not exist — every `track_usage` call raised
    AttributeError — and `get_cost_breakdown` divided by zero before any
    usage was tracked.
    """

    def __init__(self, monthly_budget: float):
        self.monthly_budget = monthly_budget
        self.current_month_cost = 0.0
        self.request_count = 0  # requests tracked this month (for averages)
        self.daily_costs = defaultdict(float)   # date -> accumulated cost
        self.model_costs = defaultdict(float)   # model name -> accumulated cost

    def track_usage(self, model: str, input_tokens: int, output_tokens: int, cost: float):
        """Record one request's cost; emit budget and anomaly alerts."""
        today = datetime.now().date()
        # Compute the average BEFORE folding in this request, so a spike is
        # compared against past behavior rather than diluted by itself.
        previous_average = self.get_average_request_cost()
        self.current_month_cost += cost
        self.request_count += 1
        self.daily_costs[today] += cost
        self.model_costs[model] += cost
        # Budget check: warn once spend passes 80% of the monthly budget.
        if self.current_month_cost > self.monthly_budget * 0.8:
            self.send_budget_alert("80% of monthly budget consumed")
        # Anomaly check: an order of magnitude above the running average.
        if previous_average > 0 and cost > previous_average * 10:
            self.send_anomaly_alert(f"Unusual high cost request: ${cost}")

    def get_average_request_cost(self) -> float:
        """Mean cost per tracked request; 0.0 before any usage."""
        if self.request_count == 0:
            return 0.0
        return self.current_month_cost / self.request_count

    def project_monthly_cost(self) -> float:
        """Naive projection: average daily spend extrapolated to 30 days."""
        if not self.daily_costs:
            return 0.0
        return (sum(self.daily_costs.values()) / len(self.daily_costs)) * 30

    def send_budget_alert(self, message: str) -> None:
        """Emit a budget alert (stub — wire to email/Slack in production)."""
        print(f"[BUDGET ALERT] {message}")

    def send_anomaly_alert(self, message: str) -> None:
        """Emit an anomaly alert (stub — wire to real alerting in production)."""
        print(f"[ANOMALY ALERT] {message}")

    def get_cost_breakdown(self) -> Dict:
        """Return a cost summary; safe to call before any usage is tracked."""
        days = len(self.daily_costs)
        return {
            "total_cost": self.current_month_cost,
            "daily_average": (sum(self.daily_costs.values()) / days) if days else 0.0,
            "model_distribution": dict(self.model_costs),
            "budget_remaining": self.monthly_budget - self.current_month_cost,
            "projected_monthly": self.project_monthly_cost(),
        }
def calculate_llm_roi(implementation_cost: float, monthly_savings: Dict) -> Dict:
    """Compute ROI metrics for an LLM project.

    Args:
        implementation_cost: One-off build/deployment cost (USD).
        monthly_savings: Monthly figures; recognized keys: "labor",
            "efficiency", "error_reduction" (direct savings),
            "customer_satisfaction", "innovation", "competitive_edge"
            (indirect benefits), and "llm_cost" (recurring LLM spend).

    Returns:
        Dict with the monthly net benefit, payback period in months
        (``inf`` when the project never pays back), annual ROI percentage,
        and the projected break-even date (``None`` when there is no payback).

    Fixes vs. the original: ``timedelta(days=inf * 30)`` raised OverflowError
    whenever the net benefit was non-positive, and a zero implementation cost
    raised ZeroDivisionError.
    """
    # Direct, quantifiable monthly savings.
    direct_savings = {
        "人力成本节省": monthly_savings.get("labor", 0),
        "效率提升价值": monthly_savings.get("efficiency", 0),
        "错误减少收益": monthly_savings.get("error_reduction", 0)
    }
    # Indirect benefits (estimated monetary value).
    indirect_benefits = {
        "客户满意度提升": monthly_savings.get("customer_satisfaction", 0),
        "创新能力增强": monthly_savings.get("innovation", 0),
        "市场竞争优势": monthly_savings.get("competitive_edge", 0)
    }
    total_monthly_value = sum(direct_savings.values()) + sum(indirect_benefits.values())
    monthly_cost = monthly_savings.get("llm_cost", 0)
    net_monthly_benefit = total_monthly_value - monthly_cost
    payback_period = (
        implementation_cost / net_monthly_benefit
        if net_monthly_benefit > 0 else float('inf')
    )
    # Only project a break-even date when the project actually pays back.
    break_even_date = (
        datetime.now() + timedelta(days=payback_period * 30)
        if payback_period != float('inf') else None
    )
    # Guard the zero-cost edge case instead of raising ZeroDivisionError.
    if implementation_cost > 0:
        annual_roi = ((net_monthly_benefit * 12 - implementation_cost)
                      / implementation_cost) * 100
    else:
        annual_roi = float('inf') if net_monthly_benefit > 0 else 0.0
    return {
        "monthly_net_benefit": net_monthly_benefit,
        "payback_period_months": payback_period,
        "annual_roi_percentage": annual_roi,
        "break_even_date": break_even_date
    }