LLM Application Patterns: Building Production-Grade AI Features¶

Calling an LLM is easy. Building something reliable is hard. This notebook covers the patterns that separate demos from production: structured output, function calling, few-shot prompting, retrieval augmentation, and chain-of-thought reasoning.

# !pip install openai anthropic pydantic

import json
import re
from typing import Optional, List
from pydantic import BaseModel, Field
import warnings
# Suppress library deprecation/user warnings so notebook output stays readable
warnings.filterwarnings('ignore')

# Try to build a live client; fall back to mocks so the notebook still runs
# without a key. (Fixed mojibake em dash in the fallback message.)
try:
    from openai import OpenAI
    client = OpenAI()  # reads OPENAI_API_KEY from env
    HAS_OPENAI = True
    print('OpenAI client initialized')
except Exception:
    # Broad catch is deliberate: import errors AND auth/config errors both
    # mean "run in mock mode", never crash the notebook.
    HAS_OPENAI = False
    print('OpenAI not available — showing patterns with mock outputs')

def mock_llm(prompt: str, system: str = '', seed: int = 0) -> str:
    """Return a fixed placeholder string standing in for a real LLM call.

    Parameters are accepted (and ignored) so call sites match a real client's
    shape. Fixed mojibake em dash in the returned message.
    """
    return '[LLM response would appear here — set OPENAI_API_KEY to run live]'

1. Structured Output — Getting Reliable JSON from LLMs¶

# Pattern 1: Pydantic schema validation
class ProductReview(BaseModel):
    """Validated structure for one analyzed product review.

    Pydantic enforces types plus the numeric/length constraints below, so a
    malformed LLM response fails loudly at parse time rather than downstream.
    """
    sentiment: str = Field(..., description='positive, negative, or neutral')
    score: float = Field(..., ge=0.0, le=10.0, description='Rating from 0-10')
    key_themes: List[str] = Field(..., description='Up to 3 main themes')
    summary: str = Field(..., max_length=200, description='One-sentence summary')

# System prompt: the inline schema mirrors the ProductReview model above —
# keep the two in sync if either changes.
REVIEW_SYSTEM = """You are a product review analyzer. Extract information from reviews 
and return ONLY valid JSON matching this schema:
{
  "sentiment": "positive | negative | neutral",
  "score": <float 0-10>,
  "key_themes": ["theme1", "theme2"],
  "summary": "<one sentence>"
}"""

def analyze_review(review_text: str) -> ProductReview:
    """Extract structured sentiment data from a free-text product review.

    Uses a JSON-mode chat completion when OpenAI is available; otherwise a
    canned response so the notebook runs offline.

    Raises:
        ValueError: if the model output is not valid JSON (previously this
            surfaced as an opaque JSONDecodeError).
        pydantic.ValidationError: if the JSON violates the ProductReview schema.
    """
    if HAS_OPENAI:
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {'role': 'system', 'content': REVIEW_SYSTEM},
                {'role': 'user', 'content': review_text},
            ],
            response_format={'type': 'json_object'},  # Forces JSON output
            temperature=0,  # Deterministic for structured extraction
        )
        raw = response.choices[0].message.content
    else:
        # Simulated response
        raw = json.dumps({
            'sentiment': 'positive',
            'score': 8.5,
            'key_themes': ['battery life', 'design', 'value'],
            'summary': 'Excellent laptop with great battery and sleek design at a fair price.'
        })

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as err:
        # Surface the offending payload (truncated) instead of a bare decode error
        raise ValueError(f'Model returned invalid JSON: {raw[:200]!r}') from err
    return ProductReview(**data)  # Pydantic validates types + constraints

review = """Just got this laptop and I'm blown away. The battery lasts all day (12 hours+)
and the design is super clean. Build quality feels premium. 
Priced fairly for what you get. Minor complaint: keyboard is a bit shallow."""

result = analyze_review(review)

# Assemble the report once, then emit it in a single call
report_lines = [
    f'Sentiment: {result.sentiment}',
    f'Score: {result.score}/10',
    f'Themes: {result.key_themes}',
    f'Summary: {result.summary}',
]
print('\n'.join(report_lines))

2. Function Calling — Connecting LLMs to Real Tools¶

import datetime

# Define tools/functions the LLM can call
# Tool specs in OpenAI function-calling format: each 'parameters' entry is a
# JSON Schema the model uses to produce arguments.
tools = [
    {
        'type': 'function',
        'function': {
            'name': 'get_weather',
            'description': 'Get current weather for a city',
            'parameters': {
                'type': 'object',
                'properties': {
                    'city': {'type': 'string', 'description': 'City name'},
                    # 'default' is advisory — the model may omit 'units' entirely,
                    # so the Python implementation carries its own default too
                    'units': {'type': 'string', 'enum': ['celsius', 'fahrenheit'], 'default': 'celsius'}
                },
                'required': ['city']
            }
        }
    },
    {
        'type': 'function',
        'function': {
            'name': 'create_calendar_event',
            'description': 'Create a calendar event',
            'parameters': {
                'type': 'object',
                'properties': {
                    'title': {'type': 'string'},
                    'date': {'type': 'string', 'description': 'ISO date YYYY-MM-DD'},
                    'duration_hours': {'type': 'number'}
                },
                'required': ['title', 'date']
            }
        }
    }
]

# Mock tool implementations
def get_weather(city: str, units: str = 'celsius') -> dict:
    """Mock weather lookup backed by a tiny in-memory table.

    Unknown cities fall back to 20°/'Unknown'. Fixed mojibake degree signs
    ('Β°C'/'Β°F' → '°C'/'°F') in the returned temperature string.
    """
    weather_db = {
        'london': {'temp': 12, 'condition': 'Cloudy'},
        'new york': {'temp': 22, 'condition': 'Sunny'},
        'tokyo': {'temp': 18, 'condition': 'Partly cloudy'},
    }
    data = weather_db.get(city.lower(), {'temp': 20, 'condition': 'Unknown'})
    unit_sym = '°C' if units == 'celsius' else '°F'
    # Table stores Celsius; convert only when Fahrenheit is requested
    temp = data['temp'] if units == 'celsius' else data['temp'] * 9/5 + 32
    return {'city': city, 'temperature': f'{temp}{unit_sym}', 'condition': data['condition']}

def create_calendar_event(title: str, date: str, duration_hours: float = 1.0) -> dict:
    """Mock calendar tool: pretend to create an event and echo its details."""
    event_record = {
        'status': 'created',
        'event': title,
        'date': date,
        'duration': f'{duration_hours}h',
    }
    return event_record

# Dispatch table: tool name (as declared in `tools`) -> local implementation
TOOL_MAP = {'get_weather': get_weather, 'create_calendar_event': create_calendar_event}

def run_tool_loop(user_message: str, max_turns: int = 8) -> str:
    """Agentic loop: LLM decides which tools to call, we execute them.

    Args:
        user_message: the user's request.
        max_turns: safety cap on model round-trips. The original `while True`
            could spin forever if the model never reached finish_reason='stop'
            and emitted no tool calls (e.g. finish_reason='length').

    Raises:
        RuntimeError: if no final answer is produced within max_turns.
    """
    messages = [{'role': 'user', 'content': user_message}]

    if not HAS_OPENAI:
        # Simulate a function call (fixed mojibake '12Β°C' -> '12°C')
        print(f'User: {user_message}')
        print('LLM decides to call: get_weather(city="London")')
        result = get_weather('London')
        print(f'Tool result: {result}')
        print('LLM final response: "In London right now it is 12°C and Cloudy."')
        return 'Simulated response'

    for _ in range(max_turns):
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=messages,
            tools=tools,
            tool_choice='auto',
        )

        msg = response.choices[0].message
        messages.append(msg)  # assistant turn must precede its tool results

        if response.choices[0].finish_reason == 'stop':
            return msg.content  # Final answer

        # Execute tool calls and feed results back to the model
        for tool_call in msg.tool_calls or []:
            fn_name = tool_call.function.name
            fn_args = json.loads(tool_call.function.arguments)
            print(f'Calling {fn_name}({fn_args})')
            fn_result = TOOL_MAP[fn_name](**fn_args)
            messages.append({
                'role': 'tool',
                'tool_call_id': tool_call.id,
                'content': json.dumps(fn_result)
            })

    raise RuntimeError(f'Tool loop did not produce a final answer within {max_turns} turns')

final_answer = run_tool_loop('What is the weather in London? Also schedule a team meeting for 2024-03-15.')
print(f'\nFinal: {final_answer}')

3. Few-Shot Prompting — Teaching by Example¶

# Few-shot outperforms zero-shot for complex or domain-specific tasks
# The key: examples must be representative of the full input distribution
# Each example pairs a question with the exact output format we expect back,
# which anchors both the SQL dialect and the "SQL only, no prose" contract.
FEW_SHOT_SYSTEM = """Extract SQL queries from natural language questions.
Return ONLY the SQL query, nothing else.

Examples:
Q: How many users signed up last month?
A: SELECT COUNT(*) FROM users WHERE signup_date >= DATE_TRUNC('month', NOW() - INTERVAL '1 month');

Q: What are the top 5 products by revenue in Q4 2023?
A: SELECT product_name, SUM(quantity * price) as revenue FROM orders WHERE order_date BETWEEN '2023-10-01' AND '2023-12-31' GROUP BY product_name ORDER BY revenue DESC LIMIT 5;

Q: Find all customers who made more than 3 purchases but never left a review.
A: SELECT c.customer_id, c.name FROM customers c JOIN orders o ON c.customer_id = o.customer_id LEFT JOIN reviews r ON c.customer_id = r.customer_id WHERE r.review_id IS NULL GROUP BY c.customer_id, c.name HAVING COUNT(o.order_id) > 3;
"""

def text_to_sql(question: str) -> str:
    """Translate a natural-language question into a SQL query via few-shot prompting."""
    if not HAS_OPENAI:
        # Canned output so the notebook demonstrates the flow offline
        return 'SELECT AVG(session_duration) FROM user_sessions WHERE user_id IN (SELECT user_id FROM users WHERE plan = \'premium\') AND session_date >= \'2024-01-01\';'

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {'role': 'system', 'content': FEW_SHOT_SYSTEM},
            {'role': 'user', 'content': f'Q: {question}'},
        ],
        temperature=0,
    )
    return response.choices[0].message.content

questions = [
    'What is the average session duration for premium users in 2024?',
    'Show me all orders over $500 from customers in California',
]

# Translate each question and show the question/SQL pair
for question in questions:
    generated_sql = text_to_sql(question)
    print(f'Q: {question}')
    print(f'SQL: {generated_sql}')
    print()

4. Chain-of-Thought — Making LLMs Show Their Work¶

# Chain-of-thought (CoT) dramatically improves multi-step reasoning
# Technique: add 'Think step by step' or show reasoning in examples

def solve_with_cot(problem: str, use_cot: bool = True) -> str:
    """Solve a word problem with or without chain-of-thought prompting.

    Args:
        problem: the question to solve.
        use_cot: when True, the system prompt demands explicit THINKING/ANSWER
            sections; when False, the model is told to answer directly.

    Returns the raw model text (or a canned response offline). Fixed mojibake
    multiplication signs ('Γ—' → '×') in the mock CoT transcript.
    """
    if use_cot:
        system = """Solve problems by thinking step by step.
Format:
THINKING:
<your reasoning>

ANSWER:
<final answer only>"""
    else:
        system = 'Answer the following question directly. Give only the final answer.'

    if HAS_OPENAI:
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': problem},
            ],
            temperature=0,
        )
        return response.choices[0].message.content
    else:
        if use_cot:
            return """THINKING:
A store has 15 items at $12 each.
Total revenue = 15 × $12 = $180
Discount is 20%, so discount amount = $180 × 0.20 = $36
Final price = $180 - $36 = $144

ANSWER:
$144"""
        else:
            return '$156'  # Wrong without CoT

problem = 'A store sells 15 items at $12 each, then applies a 20% discount to the total. What is the final price?'

# Run the same problem in both modes and print each result block
for mode_label, cot_enabled in (('WITHOUT', False), ('WITH', True)):
    print(f'=== {mode_label} Chain-of-Thought ===')
    print(solve_with_cot(problem, use_cot=cot_enabled))
    print()

print('Research finding: CoT improves accuracy on math/logic benchmarks by 20-40%')
print('Best practices:')
print('  - Works best with larger models (GPT-4, Claude 3+)')
print('  - Use temperature=0 for reproducible reasoning')
print('  - Self-consistency: run CoT 3-5x, majority-vote the answer')

5. Retrieval-Augmented Generation (RAG) — Grounding LLMs in Facts¶

# Minimal RAG: embed docs β†’ store vectors β†’ retrieve relevant context β†’ answer
# This uses TF-IDF for retrieval (real RAG uses dense embeddings)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Knowledge base: company FAQ
# In production this corpus would live in a vector store; a plain list of
# {id, title, content} dicts is enough for the TF-IDF demo below.
documents = [
    {'id': 1, 'title': 'Refund Policy', 'content': 'Customers can request a full refund within 30 days of purchase. After 30 days, store credit is offered. Digital products are non-refundable once downloaded.'},
    {'id': 2, 'title': 'Shipping Info', 'content': 'Standard shipping takes 5-7 business days. Express shipping (2-3 days) is available for $12.99. Free shipping on orders over $50.'},
    {'id': 3, 'title': 'Account Security', 'content': 'We use 256-bit AES encryption. Two-factor authentication is available and recommended. Passwords must be at least 12 characters with mixed case and numbers.'},
    {'id': 4, 'title': 'Subscription Plans', 'content': 'Basic: $9/month (10GB storage). Pro: $29/month (100GB + priority support). Enterprise: $99/month (unlimited + SLA). All plans include a 14-day free trial.'},
    {'id': 5, 'title': 'API Rate Limits', 'content': 'Free tier: 100 requests/day. Pro: 10,000 requests/day. Enterprise: unlimited. Rate limit headers are returned with each response (X-RateLimit-Remaining).'},
]

class SimpleRAG:
    """Minimal retrieval-augmented generation over an in-memory document list.

    Documents are indexed once with TF-IDF; retrieval ranks them by cosine
    similarity against the query.
    """

    def __init__(self, documents: list):
        self.documents = documents
        self.vectorizer = TfidfVectorizer(stop_words='english')
        # Index title + content together so title words also match queries
        corpus = [f"{d['title']} {d['content']}" for d in documents]
        self.doc_vectors = self.vectorizer.fit_transform(corpus)

    def retrieve(self, query: str, top_k: int = 2) -> list:
        """Return up to top_k documents with nonzero similarity, best first."""
        query_vec = self.vectorizer.transform([query])
        scores = cosine_similarity(query_vec, self.doc_vectors)[0]
        ranked = scores.argsort()[-top_k:][::-1]
        hits = []
        for idx in ranked:
            if scores[idx] > 0:  # drop documents with no overlap at all
                hits.append({'doc': self.documents[idx], 'score': scores[idx]})
        return hits

    def answer(self, question: str) -> str:
        """Retrieve context for the question, then generate a grounded answer."""
        retrieved = self.retrieve(question)

        if retrieved:
            context = '\n\n'.join(
                f"[{hit['doc']['title']}]\n{hit['doc']['content']}"
                for hit in retrieved
            )
        else:
            context = 'No relevant documents found.'

        print(f'Retrieved {len(retrieved)} documents:')
        for hit in retrieved:
            print(f'  - {hit["doc"]["title"]} (score: {hit["score"]:.3f})')

        system = f'Answer using only the provided context. If the answer is not in the context, say so.\n\nContext:\n{context}'

        if not HAS_OPENAI:
            return f'[LLM would answer using retrieved context about: {[hit["doc"]["title"] for hit in retrieved]}]'

        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': question},
            ],
            temperature=0,
        )
        return response.choices[0].message.content

rag = SimpleRAG(documents)

questions = [
    'Can I get a refund after 2 months?',
    'How much does express shipping cost?',
    'What are the API limits for free users?',
]

# Ask each question and show the grounded answer
for question in questions:
    print(f'\nQ: {question}')
    response_text = rag.answer(question)
    print(f'A: {response_text}')

6. Prompt Templates — Reusable, Testable Prompts¶

from string import Template
from dataclasses import dataclass
from typing import Callable

@dataclass
class PromptTemplate:
    """Reusable prompt template with variable substitution and validation.

    Attributes:
        name: identifier for logging/registry purposes.
        system: system prompt sent verbatim.
        user_template: user message with '{placeholder}' slots.
        variables: placeholder names that MUST be supplied to format().
        validator: optional predicate applied to the rendered output.
    """
    name: str
    system: str
    user_template: str
    variables: list
    validator: Optional[Callable] = None

    def format(self, **kwargs) -> dict:
        """Substitute placeholders and return {'system': ..., 'user': ...}.

        Raises ValueError when a declared variable is missing. Substitution is
        a single regex pass: a supplied value that itself contains '{other}'
        is inserted literally (the previous sequential str.replace approach
        would re-expand it against later variables).
        """
        missing = [v for v in self.variables if v not in kwargs]
        if missing:
            raise ValueError(f'Missing variables: {missing}')

        def _substitute(match):
            key = match.group(1)
            # Unknown placeholders are left untouched, matching old behavior
            return str(kwargs[key]) if key in kwargs else match.group(0)

        user_msg = re.sub(r'\{(\w+)\}', _substitute, self.user_template)
        return {'system': self.system, 'user': user_msg}

    def render(self, **kwargs) -> str:
        """Format the template, call the LLM (or a mock), and validate output.

        Raises ValueError if a validator is set and rejects the output.
        """
        msgs = self.format(**kwargs)
        if HAS_OPENAI:
            response = client.chat.completions.create(
                model='gpt-4o-mini',
                messages=[
                    {'role': 'system', 'content': msgs['system']},
                    {'role': 'user', 'content': msgs['user']},
                ],
                temperature=0.7,  # some variety is desirable for generation
            )
            output = response.choices[0].message.content
        else:
            output = f'[Rendered prompt: {msgs["user"][:100]}...]'

        if self.validator and not self.validator(output):
            raise ValueError(f'Output failed validation: {output[:100]}')
        return output

# Define reusable templates
# Drafts a short business email from recipient/subject/tone/bullet points
email_template = PromptTemplate(
    name='professional_email',
    system='Write professional business emails. Be concise and clear.',
    user_template='Write an email to {recipient} about {subject}. Tone: {tone}. Key points: {points}',
    variables=['recipient', 'subject', 'tone', 'points'],
)

# Reviews a pasted code snippet, steered toward caller-chosen focus areas
code_review_template = PromptTemplate(
    name='code_review',
    system='Review code for bugs, security issues, and improvements. Be specific.',
    user_template='Review this {language} code:\n```\n{code}\n```\nFocus on: {focus_areas}',
    variables=['language', 'code', 'focus_areas'],
)

# Use templates
print('Email Template:')
rendered = email_template.render(
    recipient='the engineering team',
    subject='new deployment schedule',
    tone='friendly but professional',
    points='deploy Thursday 6pm, rollback plan ready, monitor for 24h'
)
# Truncate long outputs for readable notebook display
if len(rendered) > 200:
    print(rendered[:200] + '...')
else:
    print(rendered)

LLM Pattern Cheat Sheet¶

Pattern              When to Use                    Key Parameter
─────────────────────────────────────────────────────────────────
Zero-shot            Simple tasks, clear intent      temperature=0
Few-shot             Domain-specific, format matters 3-5 examples
Chain-of-Thought     Math, logic, multi-step         'Think step by step'
Structured Output    When you need parseable data    response_format=json
Function Calling     LLM needs external data/tools  tools=[...]
RAG                  Factual Q&A on private docs     top_k=3-5 chunks
Self-Consistency     High-stakes decisions           Run 5x, majority vote

Temperature Guide:
  0.0  → Deterministic, structured extraction, code generation
  0.3  → Factual Q&A with slight variation acceptable
  0.7  → Creative writing, brainstorming, varied outputs
  1.0+ → Maximum diversity (usually too random for production)

Cost Optimization:
  - Cache identical prompts (exact match)
  - Use GPT-4o-mini / Claude Haiku for simple tasks
  - Reserve GPT-4o / Claude Sonnet for complex reasoning
  - Truncate context to what's actually needed

Exercises¶

  1. Add retry logic with exponential backoff to analyze_review() for handling API rate limits.

  2. Extend the SimpleRAG to use OpenAI embeddings (text-embedding-3-small) instead of TF-IDF.

  3. Implement a ‘self-consistency’ wrapper that calls an LLM N times and returns the majority answer.

  4. Build a conversation memory system that summarizes older messages when context exceeds 4096 tokens.

  5. Create a PromptTemplate for generating unit tests, with validation that the output contains def test_.