Understanding LLM Costs
LLM costs are primarily driven by tokens - the units of text processed:
# Approximate pricing (as of Dec 2024)
| Model | Input (per 1M tokens) | Output (per 1M tokens) |
|----------------|----------------------|------------------------|
| GPT-4 Turbo | $10.00 | $30.00 |
| GPT-4o | $2.50 | $10.00 |
| GPT-3.5 Turbo | $0.50 | $1.50 |
| Claude 3 Opus | $15.00 | $75.00 |
| Claude 3 Sonnet| $3.00 | $15.00 |
| Claude 3 Haiku | $0.25 | $1.25 |
# Cost calculation: prices are quoted per 1M tokens, hence the division by 1_000_000
cost = (input_tokens * input_price + output_tokens * output_price) / 1_000_000
Token Optimization
Reduce Prompt Length
# Before: Verbose prompt (150 tokens)
prompt = """
You are a helpful AI assistant. Your task is to analyze the
following text and provide a comprehensive summary. Please
ensure that your summary captures all the key points and
main ideas from the original text. Be thorough but concise.
Text to summarize:
{text}
"""
# After: Concise prompt (30 tokens)
prompt = """Summarize the key points:
{text}"""
# Savings: 80% fewer prompt tokens (150 -> 30)
Limit Output Length
response = client.chat.completions.create(
model="gpt-4",
messages=[...],
max_tokens=150, # Limit output
temperature=0.7
)
# Better: also give explicit length instructions (max_tokens alone cuts the reply off mid-sentence)
prompt = "Answer in 2-3 sentences: {question}"
Truncate Context
def truncate_context(text: str, max_tokens: int = 2000, model: str = "gpt-4") -> str:
    """Truncate text to stay within a token limit.

    Args:
        text: Text to shorten.
        max_tokens: Maximum number of tokens of ``text`` to keep; must be >= 0.
        model: Model name used to select the tiktoken encoding.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the first
        ``max_tokens`` tokens decoded back to a string with "..." appended.
        The marker is added after truncation, so it is not counted against
        ``max_tokens``.

    Raises:
        ValueError: If ``max_tokens`` is negative.
    """
    # A negative limit would otherwise slice from the END of the token list
    # (tokens[:-n]) and silently return almost the whole text.
    if max_tokens < 0:
        raise ValueError("max_tokens must be non-negative")

    # Imported lazily so that merely defining this helper does not require tiktoken.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return enc.decode(tokens[:max_tokens]) + "..."
# Use in RAG
context = truncate_context("\n".join(retrieved_docs), max_tokens=2000)
Caching Strategies
Semantic Caching
from langchain.cache import InMemoryCache, RedisSemanticCache
from langchain.globals import set_llm_cache
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import langchain

# Option 1: simple in-memory cache (exact-match prompts only)
set_llm_cache(InMemoryCache())

# Option 2: semantic cache - matches similar queries.
# Registering a second cache replaces the first; pick one in real code.
set_llm_cache(
    RedisSemanticCache(
        redis_url="redis://localhost:6379",
        embedding=OpenAIEmbeddings(),
        # NOTE(review): score_threshold is used by the library as a maximum
        # vector *distance* (default 0.2), so smaller means stricter.
        # 0.05 roughly corresponds to 0.95 cosine similarity - confirm
        # against the installed LangChain version.
        score_threshold=0.05,
    )
)

# Now duplicate/similar queries hit cache
llm = ChatOpenAI(model="gpt-4")
llm.invoke("What is Python?")  # API call
llm.invoke("What is Python?")  # Cache hit
llm.invoke("Tell me about Python")  # Cache hit if within the semantic threshold
Custom Response Cache
import hashlib
import json
import redis
class LLMCache:
    """Redis-backed cache for LLM responses.

    Entries are stored under a SHA-256 digest of the request
    (messages + model) and written with ``setex`` using ``ttl`` as the
    expiry, so each entry carries its own time-to-live.
    """

    def __init__(self, redis_client, ttl=3600):
        """Keep the Redis client and the expiry (seconds) used for new entries."""
        self.redis = redis_client
        self.ttl = ttl

    def _key(self, messages: list, model: str) -> str:
        """Return the cache key for a (messages, model) request."""
        # sort_keys makes the digest independent of dict insertion order, so
        # {"role": ..., "content": ...} and {"content": ..., "role": ...}
        # address the same cache entry.
        content = json.dumps({"messages": messages, "model": model}, sort_keys=True)
        return f"llm:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, messages: list, model: str):
        """Return the cached response for the request, or None on a miss."""
        cached = self.redis.get(self._key(messages, model))
        if not cached:
            return None
        return json.loads(cached)

    def set(self, messages: list, model: str, response: dict):
        """Store a JSON-serializable response for the request with the configured TTL."""
        self.redis.setex(self._key(messages, model), self.ttl, json.dumps(response))
# Usage
cache = LLMCache(redis.Redis())


def cached_completion(messages, model="gpt-4"):
    """Return a chat completion as a plain dict, served from the cache when possible.

    Both paths return the same shape: a cache hit yields the stored dict, and
    a miss converts the fresh API response to a dict before caching and
    returning it, so callers never have to distinguish the two cases.

    Args:
        messages: Chat messages to send (list of role/content dicts).
        model: Model name to request.

    Returns:
        The completion response as a dict.
    """
    cached = cache.get(messages, model)
    if cached is not None:
        return cached

    # NOTE(review): `client` is expected to be a synchronous OpenAI client
    # defined elsewhere - confirm.
    response = client.chat.completions.create(model=model, messages=messages)
    payload = response.model_dump()
    cache.set(messages, model, payload)
    return payload
Model Selection Strategy
class SmartModelSelector:
    """Route queries to appropriate models based on complexity."""

    # Labels the classifier is asked to choose between.
    _LABELS = ("simple", "medium", "complex")

    # Model used for each label (approximate input prices per 1M tokens).
    _MODEL_BY_LABEL = {
        "simple": "gpt-3.5-turbo",  # ~$0.50 / 1M input tokens
        "medium": "gpt-4o-mini",  # ~$0.15 / 1M input tokens
        "complex": "gpt-4",  # ~$30 / 1M input tokens
    }

    def __init__(self):
        self.classifier = ChatOpenAI(model="gpt-3.5-turbo")  # Cheap classifier

    def classify_complexity(self, query: str) -> str:
        """Ask the classifier model for a label and normalise its reply.

        Returns:
            One of 'simple', 'medium' or 'complex'. Replies that do not name
            exactly one label are treated as 'complex'.
        """
        response = self.classifier.invoke(
            f"Classify this query complexity as 'simple', 'medium', or 'complex': {query}"
        )
        return self._parse_label(response.content)

    @classmethod
    def _parse_label(cls, raw: str) -> str:
        """Extract a complexity label from free-form classifier output."""
        # Chat models often wrap the label in quotes, punctuation or a short
        # sentence ("Simple.", "'medium'"); compare whole words so that
        # "complexity" is not mistaken for "complex".
        words = {word.strip("'\"`.,:;!?()") for word in raw.lower().split()}
        mentioned = [label for label in cls._LABELS if label in words]
        if len(mentioned) == 1:
            return mentioned[0]
        # Ambiguous or unrecognised reply: fall back to the strongest tier.
        return "complex"

    def get_model(self, query: str) -> str:
        """Return the model name to use for ``query``."""
        complexity = self.classify_complexity(query)
        return self._MODEL_BY_LABEL.get(complexity, "gpt-4")
# Usage
selector = SmartModelSelector()


def smart_query(question: str):
    """Answer ``question`` with whichever model the selector picks for it.

    Prints the chosen model together with the first 50 characters of the
    question, then returns the result of invoking that model.
    """
    chosen = selector.get_model(question)
    preview = question[:50]
    print(f"Using {chosen} for: {preview}...")
    llm = ChatOpenAI(model=chosen)
    return llm.invoke(question)


# Simple queries use cheap models
smart_query("What is 2+2?")  # Uses GPT-3.5
# Complex queries use powerful models
smart_query("Explain the implications of quantum entanglement...")
Batching Requests
import asyncio
from openai import AsyncOpenAI
client = AsyncOpenAI()
async def batch_process(
    queries: list[str],
    batch_size: int = 10,
    *,
    model: str = "gpt-3.5-turbo",
    delay: float = 1.0,
):
    """Process queries in batches with rate limiting.

    Each batch is sent concurrently; consecutive batches are separated by a
    pause of ``delay`` seconds.

    Args:
        queries: User prompts, one request per entry.
        batch_size: Number of requests issued concurrently; must be >= 1.
        model: Model name used for every request.
        delay: Seconds to wait between batches.

    Returns:
        The responses in the same order as ``queries``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every query.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results = []
    for start in range(0, len(queries), batch_size):
        if start:
            # Rate limiting delay - only between batches, so no time is
            # wasted sleeping after the final one.
            await asyncio.sleep(delay)
        batch = queries[start:start + batch_size]
        # Process batch concurrently; gather preserves input order.
        batch_results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": q}],
                )
                for q in batch
            ]
        )
        results.extend(batch_results)
    return results
# Process 100 queries efficiently
queries = ["Query " + str(i) for i in range(100)]
results = asyncio.run(batch_process(queries))
Use Local Models
# Use Ollama for development and simple queries
from langchain_community.llms import Ollama
# Local model - $0 cost
local_llm = Ollama(model="llama2")
# Use local for:
# - Development and testing
# - Simple classification
# - Non-critical queries
# - Privacy-sensitive data
def cost_aware_query(query: str, require_quality: bool = False):
    """Answer ``query`` with the local model unless quality is required.

    When ``require_quality`` is true a hosted "gpt-4" chat model is created
    and invoked; otherwise the module-level ``local_llm`` handles the query.
    """
    # The hosted client is only constructed on the quality path.
    llm = ChatOpenAI(model="gpt-4") if require_quality else local_llm
    return llm.invoke(query)
Monitor and Track Costs
import tiktoken
from dataclasses import dataclass
from typing import Dict
@dataclass
class UsageTracker:
    """Accumulate per-model call and token counts and convert them to costs."""

    # model name -> {"calls", "input_tokens", "output_tokens"} running totals
    model_usage: Dict[str, Dict] = None

    def __post_init__(self):
        # None stands in for "no usage supplied" (a dict default would be
        # shared between instances). Only create a fresh dict in that case so
        # usage passed to the constructor is kept rather than discarded.
        if self.model_usage is None:
            self.model_usage = {}

    def track(self, model: str, input_tokens: int, output_tokens: int):
        """Record one call to ``model`` and add its token counts to the totals."""
        usage = self.model_usage.setdefault(
            model, {"calls": 0, "input_tokens": 0, "output_tokens": 0}
        )
        usage["calls"] += 1
        usage["input_tokens"] += input_tokens
        usage["output_tokens"] += output_tokens

    def get_costs(self, prices: dict) -> dict:
        """Return the total cost per tracked model.

        Args:
            prices: Mapping of model name to ``{"input": p, "output": p}``,
                with prices expressed per 1M tokens.

        Returns:
            Mapping of model name to cost. Models without an entry in
            ``prices`` are omitted.
        """
        costs = {}
        for model, usage in self.model_usage.items():
            price = prices.get(model)
            if price is None:
                continue
            input_cost = usage["input_tokens"] * price["input"] / 1_000_000
            output_cost = usage["output_tokens"] * price["output"] / 1_000_000
            costs[model] = input_cost + output_cost
        return costs
# Usage
tracker = UsageTracker()


def tracked_completion(messages, model="gpt-4"):
    """Request a chat completion and record the token usage it reports.

    The prompt and completion token counts from the response are added to
    the module-level ``tracker`` under ``model``; the response itself is
    returned untouched.
    """
    # NOTE(review): relies on a module-level `client`; assumes it is a
    # synchronous client so the response is available immediately - confirm.
    response = client.chat.completions.create(model=model, messages=messages)
    usage = response.usage
    tracker.track(
        model=model,
        input_tokens=usage.prompt_tokens,
        output_tokens=usage.completion_tokens,
    )
    return response
# Check costs (prices are USD per 1M tokens)
prices = {
    # "gpt-4" is the original GPT-4 model, priced above GPT-4 Turbo ($10/$30).
    "gpt-4": {"input": 30, "output": 60},
    "gpt-3.5-turbo": {"input": 0.5, "output": 1.5},
}
print(tracker.get_costs(prices))
Cost Optimization Checklist
Prompt Engineering
- Keep prompts concise
- Limit output length
- Remove redundant context
Caching
- Cache common queries
- Use semantic caching
- Set appropriate TTLs
Model Selection
- Use cheaper models when possible
- Route by complexity
- Consider local models
Monitoring
- Track token usage
- Set budget alerts
- Review usage patterns
Learn Production AI Cost Management
Our Agentic AI program covers cost optimization for production deployments. Build efficient, cost-effective AI applications.
Explore Agentic AI Program