LLM Application Patterns: Building Production-Grade AI Features
Calling an LLM is easy. Building something reliable is hard. This notebook covers the patterns that separate demos from production: structured output, function calling, few-shot prompting, retrieval augmentation, and chain-of-thought reasoning.
# Optional installs needed for live API calls (run in a notebook cell):
# !pip install openai anthropic pydantic
import json
import re
from typing import Optional, List
from pydantic import BaseModel, Field
import warnings
# Silence library deprecation warnings for cleaner notebook output.
warnings.filterwarnings('ignore')
# Graceful degradation: if the OpenAI SDK or API key is unavailable, the
# notebook falls back to canned mock outputs so every cell still runs.
try:
from openai import OpenAI
client = OpenAI() # reads OPENAI_API_KEY from env
HAS_OPENAI = True
print('OpenAI client initialized')
except Exception:
# Broad catch is deliberate: any failure (missing package, bad key,
# network issue) switches the whole notebook into demo mode.
HAS_OPENAI = False
print('OpenAI not available β showing patterns with mock outputs')
def mock_llm(prompt: str, system: str = '', seed: int = 0) -> str:
    """Stand-in for a real LLM call used when no API key is configured.

    The parameters mirror a real completion call but are ignored; the
    return value is always the same placeholder string.
    """
    placeholder = '[LLM response would appear here β set OPENAI_API_KEY to run live]'
    return placeholder
1. Structured Output — Getting Reliable JSON from LLMs
# Pattern 1: Pydantic schema validation
class ProductReview(BaseModel):
    """Structured extraction target for a single product review.

    Pydantic enforces the contract the system prompt promises: a sentiment
    label, a bounded numeric score, a short theme list, and a length-capped
    one-sentence summary.
    """

    sentiment: str = Field(..., description='positive, negative, or neutral')
    score: float = Field(..., ge=0.0, le=10.0, description='Rating from 0-10')
    # Fix: the description promised "Up to 3 main themes" but nothing
    # enforced it — max_length=3 makes validation reject longer lists.
    key_themes: List[str] = Field(..., max_length=3, description='Up to 3 main themes')
    summary: str = Field(..., max_length=200, description='One-sentence summary')
# System prompt pinning the model to a strict JSON shape; keep the schema
# sketch below in sync with the ProductReview model it is parsed into.
REVIEW_SYSTEM = """You are a product review analyzer. Extract information from reviews
and return ONLY valid JSON matching this schema:
{
"sentiment": "positive | negative | neutral",
"score": <float 0-10>,
"key_themes": ["theme1", "theme2"],
"summary": "<one sentence>"
}"""
def analyze_review(review_text: str) -> ProductReview:
    """Extract a structured ProductReview from free-form review text.

    Uses JSON mode with temperature=0 when the OpenAI client is available;
    otherwise returns a canned example so the notebook runs offline.

    Args:
        review_text: the raw review to analyze.

    Returns:
        A validated ProductReview instance.

    Raises:
        ValueError: if the model output is not valid JSON.
        pydantic.ValidationError: if the JSON violates the schema.
    """
    if HAS_OPENAI:
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {'role': 'system', 'content': REVIEW_SYSTEM},
                {'role': 'user', 'content': review_text},
            ],
            response_format={'type': 'json_object'},  # Forces JSON output
            temperature=0,  # Deterministic for structured extraction
        )
        raw = response.choices[0].message.content
    else:
        # Simulated response
        raw = json.dumps({
            'sentiment': 'positive',
            'score': 8.5,
            'key_themes': ['battery life', 'design', 'value'],
            'summary': 'Excellent laptop with great battery and sleek design at a fair price.'
        })
    # Robustness fix: surface malformed model output as a clear, chained
    # error instead of a bare JSONDecodeError from deep inside json.
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as err:
        raise ValueError(f'Model did not return valid JSON: {raw[:120]!r}') from err
    return ProductReview(**data)  # Pydantic validates types + constraints
# Sample review used for the demo run below.
review = """Just got this laptop and I'm blown away. The battery lasts all day (12 hours+)
and the design is super clean. Build quality feels premium.
Priced fairly for what you get. Minor complaint: keyboard is a bit shallow."""
# The result is a validated model, so typed field access is safe.
result = analyze_review(review)
print(f'Sentiment: {result.sentiment}')
print(f'Score: {result.score}/10')
print(f'Themes: {result.key_themes}')
print(f'Summary: {result.summary}')
2. Function Calling — Connecting LLMs to Real Tools
import datetime
# Define tools/functions the LLM can call
# OpenAI "tools" format: the model only ever sees these JSON Schemas; the
# actual Python implementations are wired up separately in TOOL_MAP.
# NOTE(review): `datetime` is imported but unused in the code shown here —
# confirm whether a later cell needs it before removing.
tools = [
{
'type': 'function',
'function': {
'name': 'get_weather',
'description': 'Get current weather for a city',
'parameters': {
'type': 'object',
'properties': {
'city': {'type': 'string', 'description': 'City name'},
'units': {'type': 'string', 'enum': ['celsius', 'fahrenheit'], 'default': 'celsius'}
},
'required': ['city']
}
}
},
{
'type': 'function',
'function': {
'name': 'create_calendar_event',
'description': 'Create a calendar event',
'parameters': {
'type': 'object',
'properties': {
'title': {'type': 'string'},
'date': {'type': 'string', 'description': 'ISO date YYYY-MM-DD'},
'duration_hours': {'type': 'number'}
},
'required': ['title', 'date']
}
}
}
]
# Mock tool implementations
def get_weather(city: str, units: str = 'celsius') -> dict:
    """Mock weather lookup keyed by lowercase city name.

    Unknown cities fall back to a 20-degree 'Unknown' record. Temperatures
    are stored in Celsius and converted on the fly for fahrenheit requests.
    """
    known = {
        'london': {'temp': 12, 'condition': 'Cloudy'},
        'new york': {'temp': 22, 'condition': 'Sunny'},
        'tokyo': {'temp': 18, 'condition': 'Partly cloudy'},
    }
    record = known.get(city.lower(), {'temp': 20, 'condition': 'Unknown'})
    if units == 'celsius':
        temp, unit_sym = record['temp'], 'Β°C'
    else:
        temp, unit_sym = record['temp'] * 9/5 + 32, 'Β°F'
    return {'city': city, 'temperature': f'{temp}{unit_sym}', 'condition': record['condition']}
def create_calendar_event(title: str, date: str, duration_hours: float = 1.0) -> dict:
    """Mock event creation: echoes back a confirmation payload."""
    confirmation = {
        'status': 'created',
        'event': title,
        'date': date,
        'duration': f'{duration_hours}h',
    }
    return confirmation
TOOL_MAP = {'get_weather': get_weather, 'create_calendar_event': create_calendar_event}
def run_tool_loop(user_message: str, max_rounds: int = 8) -> str:
    """Agentic loop: the LLM decides which tools to call; we execute them.

    Args:
        user_message: the user's request.
        max_rounds: safety cap on LLM round-trips (fix: the original
            ``while True`` loop could spin forever if the model kept
            finishing with a reason other than 'stop').

    Returns:
        The model's final text answer, or a simulated answer offline.

    Raises:
        RuntimeError: if the loop does not converge within max_rounds.
    """
    messages = [{'role': 'user', 'content': user_message}]
    if not HAS_OPENAI:
        # Simulate a function call
        print(f'User: {user_message}')
        print('LLM decides to call: get_weather(city="London")')
        result = get_weather('London')
        print(f'Tool result: {result}')
        print('LLM final response: "In London right now it is 12Β°C and Cloudy."')
        return 'Simulated response'
    for _ in range(max_rounds):
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=messages,
            tools=tools,
            tool_choice='auto',
        )
        msg = response.choices[0].message
        messages.append(msg)
        if response.choices[0].finish_reason == 'stop':
            return msg.content  # Final answer
        # Execute tool calls
        for tool_call in msg.tool_calls or []:
            fn_name = tool_call.function.name
            fn_args = json.loads(tool_call.function.arguments)
            print(f'Calling {fn_name}({fn_args})')
            fn = TOOL_MAP.get(fn_name)
            # Fix: report hallucinated tool names back to the model instead
            # of crashing with a KeyError.
            fn_result = fn(**fn_args) if fn else {'error': f'unknown tool {fn_name}'}
            messages.append({
                'role': 'tool',
                'tool_call_id': tool_call.id,
                'content': json.dumps(fn_result)
            })
    raise RuntimeError(f'Tool loop did not converge within {max_rounds} rounds')
# Kick off the agent loop with a compound request (weather + scheduling).
answer = run_tool_loop('What is the weather in London? Also schedule a team meeting for 2024-03-15.')
print(f'\nFinal: {answer}')
3. Few-Shot Prompting — Teaching by Example
# Few-shot outperforms zero-shot for complex or domain-specific tasks
# The key: examples must be representative of the full input distribution
# Three worked Q->A pairs covering an aggregate, a ranked/grouped query, and
# a multi-join anti-join; the model is instructed to emit SQL only.
FEW_SHOT_SYSTEM = """Extract SQL queries from natural language questions.
Return ONLY the SQL query, nothing else.
Examples:
Q: How many users signed up last month?
A: SELECT COUNT(*) FROM users WHERE signup_date >= DATE_TRUNC('month', NOW() - INTERVAL '1 month');
Q: What are the top 5 products by revenue in Q4 2023?
A: SELECT product_name, SUM(quantity * price) as revenue FROM orders WHERE order_date BETWEEN '2023-10-01' AND '2023-12-31' GROUP BY product_name ORDER BY revenue DESC LIMIT 5;
Q: Find all customers who made more than 3 purchases but never left a review.
A: SELECT c.customer_id, c.name FROM customers c JOIN orders o ON c.customer_id = o.customer_id LEFT JOIN reviews r ON c.customer_id = r.customer_id WHERE r.review_id IS NULL GROUP BY c.customer_id, c.name HAVING COUNT(o.order_id) > 3;
"""
def text_to_sql(question: str) -> str:
    """Translate a natural-language question into a SQL query string.

    Relies on the few-shot examples in FEW_SHOT_SYSTEM; offline it returns
    a canned query for demonstration purposes.
    """
    if not HAS_OPENAI:
        return 'SELECT AVG(session_duration) FROM user_sessions WHERE user_id IN (SELECT user_id FROM users WHERE plan = \'premium\') AND session_date >= \'2024-01-01\';'
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {'role': 'system', 'content': FEW_SHOT_SYSTEM},
            {'role': 'user', 'content': f'Q: {question}'},
        ],
        temperature=0,
    )
    return response.choices[0].message.content
# Demo: run two unseen questions through the few-shot prompt.
questions = [
'What is the average session duration for premium users in 2024?',
'Show me all orders over $500 from customers in California',
]
for q in questions:
sql = text_to_sql(q)
print(f'Q: {q}')
print(f'SQL: {sql}')
print()
4. Chain-of-Thought — Making LLMs Show Their Work
# Chain-of-thought (CoT) dramatically improves multi-step reasoning
# Technique: add 'Think step by step' or show reasoning in examples
# solve_with_cot: answer a word problem either directly or with an explicit
# THINKING/ANSWER scaffold so the two modes can be compared side by side.
def solve_with_cot(problem: str, use_cot: bool = True) -> str:
if use_cot:
system = """Solve problems by thinking step by step.
Format:
THINKING:
<your reasoning>
ANSWER:
<final answer only>"""
else:
system = 'Answer the following question directly. Give only the final answer.'
if HAS_OPENAI:
response = client.chat.completions.create(
model='gpt-4o-mini',
messages=[
{'role': 'system', 'content': system},
{'role': 'user', 'content': problem},
],
temperature=0,
)
return response.choices[0].message.content
else:
# Offline mock: the CoT branch shows worked arithmetic; the direct
# branch intentionally returns a wrong answer to illustrate the gap.
if use_cot:
return """THINKING:
A store has 15 items at $12 each.
Total revenue = 15 Γ $12 = $180
Discount is 20%, so discount amount = $180 Γ 0.20 = $36
Final price = $180 - $36 = $144
ANSWER:
$144"""
else:
return '$156' # Wrong without CoT
# Demo: the same arithmetic word problem with and without the CoT scaffold.
problem = 'A store sells 15 items at $12 each, then applies a 20% discount to the total. What is the final price?'
print('=== WITHOUT Chain-of-Thought ===')
print(solve_with_cot(problem, use_cot=False))
print()
print('=== WITH Chain-of-Thought ===')
print(solve_with_cot(problem, use_cot=True))
print()
print('Research finding: CoT improves accuracy on math/logic benchmarks by 20-40%')
print('Best practices:')
print(' - Works best with larger models (GPT-4, Claude 3+)')
print(' - Use temperature=0 for reproducible reasoning')
print(' - Self-consistency: run CoT 3-5x, majority-vote the answer')
5. Retrieval-Augmented Generation (RAG) — Grounding LLMs in Facts
# Minimal RAG: embed docs β store vectors β retrieve relevant context β answer
# This uses TF-IDF for retrieval (real RAG uses dense embeddings)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Knowledge base: company FAQ
# Each entry is one retrievable chunk; SimpleRAG concatenates title +
# content for TF-IDF indexing.
documents = [
{'id': 1, 'title': 'Refund Policy', 'content': 'Customers can request a full refund within 30 days of purchase. After 30 days, store credit is offered. Digital products are non-refundable once downloaded.'},
{'id': 2, 'title': 'Shipping Info', 'content': 'Standard shipping takes 5-7 business days. Express shipping (2-3 days) is available for $12.99. Free shipping on orders over $50.'},
{'id': 3, 'title': 'Account Security', 'content': 'We use 256-bit AES encryption. Two-factor authentication is available and recommended. Passwords must be at least 12 characters with mixed case and numbers.'},
{'id': 4, 'title': 'Subscription Plans', 'content': 'Basic: $9/month (10GB storage). Pro: $29/month (100GB + priority support). Enterprise: $99/month (unlimited + SLA). All plans include a 14-day free trial.'},
{'id': 5, 'title': 'API Rate Limits', 'content': 'Free tier: 100 requests/day. Pro: 10,000 requests/day. Enterprise: unlimited. Rate limit headers are returned with each response (X-RateLimit-Remaining).'},
]
class SimpleRAG:
    """Toy retrieval-augmented generation over an in-memory document list.

    Documents are indexed with TF-IDF at construction time; `answer`
    retrieves the best-matching chunks and grounds the LLM prompt in them.
    """

    def __init__(self, documents: list):
        self.documents = documents
        self.vectorizer = TfidfVectorizer(stop_words='english')
        corpus = [f"{d['title']} {d['content']}" for d in documents]
        self.doc_vectors = self.vectorizer.fit_transform(corpus)

    def retrieve(self, query: str, top_k: int = 2) -> list:
        """Return up to `top_k` documents with nonzero similarity to `query`."""
        query_vec = self.vectorizer.transform([query])
        scores = cosine_similarity(query_vec, self.doc_vectors)[0]
        ranked = scores.argsort()[-top_k:][::-1]
        hits = []
        for idx in ranked:
            if scores[idx] > 0:
                hits.append({'doc': self.documents[idx], 'score': scores[idx]})
        return hits

    def answer(self, question: str) -> str:
        """Retrieve context for `question`, then generate a grounded answer."""
        retrieved = self.retrieve(question)
        if retrieved:
            context = '\n\n'.join(
                f"[{r['doc']['title']}]\n{r['doc']['content']}" for r in retrieved
            )
        else:
            context = 'No relevant documents found.'
        print(f'Retrieved {len(retrieved)} documents:')
        for r in retrieved:
            print(f' - {r["doc"]["title"]} (score: {r["score"]:.3f})')
        system = f'Answer using only the provided context. If the answer is not in the context, say so.\n\nContext:\n{context}'
        if HAS_OPENAI:
            response = client.chat.completions.create(
                model='gpt-4o-mini',
                messages=[
                    {'role': 'system', 'content': system},
                    {'role': 'user', 'content': question},
                ],
                temperature=0,
            )
            return response.choices[0].message.content
        return f'[LLM would answer using retrieved context about: {[r["doc"]["title"] for r in retrieved]}]'
rag = SimpleRAG(documents)
# Each question targets a different FAQ document (refunds, shipping, limits).
questions = [
'Can I get a refund after 2 months?',
'How much does express shipping cost?',
'What are the API limits for free users?',
]
for q in questions:
print(f'\nQ: {q}')
answer = rag.answer(q)
print(f'A: {answer}')
6. Prompt Templates — Reusable, Testable Prompts
from string import Template
from dataclasses import dataclass
from typing import Callable
@dataclass
class PromptTemplate:
    """Reusable prompt template with variable substitution and validation."""

    name: str
    system: str
    user_template: str
    variables: list
    validator: Optional[Callable] = None

    def format(self, **kwargs) -> dict:
        """Substitute `{var}` placeholders; raise ValueError on missing vars."""
        missing = [v for v in self.variables if v not in kwargs]
        if missing:
            raise ValueError(f'Missing variables: {missing}')
        rendered = self.user_template
        for key, value in kwargs.items():
            rendered = rendered.replace(f'{{{key}}}', str(value))
        return {'system': self.system, 'user': rendered}

    def render(self, **kwargs) -> str:
        """Format the prompt, call the LLM (or mock), and validate the output."""
        msgs = self.format(**kwargs)
        if HAS_OPENAI:
            response = client.chat.completions.create(
                model='gpt-4o-mini',
                messages=[
                    {'role': 'system', 'content': msgs['system']},
                    {'role': 'user', 'content': msgs['user']},
                ],
                temperature=0.7,
            )
            output = response.choices[0].message.content
        else:
            output = f'[Rendered prompt: {msgs["user"][:100]}...]'
        if self.validator and not self.validator(output):
            raise ValueError(f'Output failed validation: {output[:100]}')
        return output
# Define reusable templates
# Each `variables` list must match the {placeholders} in its user_template
# for format()'s missing-variable check to be meaningful.
email_template = PromptTemplate(
name='professional_email',
system='Write professional business emails. Be concise and clear.',
user_template='Write an email to {recipient} about {subject}. Tone: {tone}. Key points: {points}',
variables=['recipient', 'subject', 'tone', 'points'],
)
code_review_template = PromptTemplate(
name='code_review',
system='Review code for bugs, security issues, and improvements. Be specific.',
user_template='Review this {language} code:\n```\n{code}\n```\nFocus on: {focus_areas}',
variables=['language', 'code', 'focus_areas'],
)
# Use templates
print('Email Template:')
rendered = email_template.render(
recipient='the engineering team',
subject='new deployment schedule',
tone='friendly but professional',
points='deploy Thursday 6pm, rollback plan ready, monitor for 24h'
)
print(rendered[:200] + '...' if len(rendered) > 200 else rendered)
LLM Pattern Cheat Sheet
Pattern When to Use Key Parameter
-----------------------------------------------------------------
Zero-shot Simple tasks, clear intent temperature=0
Few-shot Domain-specific, format matters 3-5 examples
Chain-of-Thought Math, logic, multi-step 'Think step by step'
Structured Output When you need parseable data response_format=json
Function Calling LLM needs external data/tools tools=[...]
RAG Factual Q&A on private docs top_k=3-5 chunks
Self-Consistency High-stakes decisions Run 5x, majority vote
Temperature Guide:
0.0 → Deterministic, structured extraction, code generation
0.3 → Factual Q&A with slight variation acceptable
0.7 → Creative writing, brainstorming, varied outputs
1.0+ → Maximum diversity (usually too random for production)
Cost Optimization:
- Cache identical prompts (exact match)
- Use GPT-4o-mini / Claude Haiku for simple tasks
- Reserve GPT-4o / Claude Sonnet for complex reasoning
- Truncate context to what's actually needed
Exercises
1. Add retry logic with exponential backoff to `analyze_review()` for handling API rate limits.
2. Extend `SimpleRAG` to use OpenAI embeddings (text-embedding-3-small) instead of TF-IDF.
3. Implement a "self-consistency" wrapper that calls an LLM N times and returns the majority answer.
4. Build a conversation memory system that summarizes older messages when context exceeds 4096 tokens.
5. Create a `PromptTemplate` for generating unit tests, with validation that the output contains `def test_`.