Local LLM Servers and APIsΒΆ
Run local LLMs as API servers — OpenAI-compatible, low latency, fully private.
Why Run a Local LLM Server?ΒΆ
Benefit |
Details |
|---|---|
OpenAI-compatible API |
Drop in |
Cost |
Zero per-token charges |
Privacy |
Requests never leave your network |
Latency |
Sub-second first token on modern hardware |
Control |
Pin model versions, tune parameters |
Throughput |
Scale horizontally with multiple replicas |
Server Options at a GlanceΒΆ
Server |
Best For |
GPU Required |
OpenAI API |
|---|---|---|---|
Ollama |
Development, macOS, ease of use |
No (CPU fallback) |
Yes ( |
llama.cpp server |
CPU-only machines, lowest latency |
No |
Yes |
LM Studio |
GUI, Windows/macOS users |
No |
Yes |
vLLM |
Production GPU throughput |
Yes (CUDA) |
Yes |
This notebook shows how to interact with all four from Python.
SetupΒΆ
!pip install -q requests openai ollama
import json
import time
import subprocess
import requests
from typing import List, Dict, Optional, Iterator
import ollama
from openai import OpenAI
print("Imports ready.")
Part 1 β Ollama as API ServerΒΆ
Ollama starts an HTTP server on port 11434 by default. It exposes:
Its own REST API (/api/...)
An OpenAI-compatible API at /v1/...
Start the server:
ollama serve # foreground
# or it starts automatically when you run any model
1.1 REST API EndpointsΒΆ
OLLAMA_BASE = "http://localhost:11434"  # default Ollama server address

def ollama_request(endpoint: str, payload: Optional[dict] = None, method: str = "POST") -> dict:
    """Send a request to the Ollama REST API.

    Args:
        endpoint: API path beginning with '/', e.g. '/api/tags'.
        payload: JSON body for POST requests (ignored for GET).
        method: 'GET' or 'POST' (any value other than 'GET' is sent as POST).

    Returns:
        The parsed JSON response, or {} when the server is unreachable or
        the request times out.
    """
    url = f"{OLLAMA_BASE}{endpoint}"
    try:
        if method == "GET":
            response = requests.get(url, timeout=30)
        else:
            response = requests.post(url, json=payload, timeout=60)
        response.raise_for_status()
        return response.json()
    # Also catch Timeout: previously a slow/hung server crashed the cell.
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        print(f"Cannot connect to Ollama at {OLLAMA_BASE}. Run: ollama serve")
        return {}
# Health check: any HTTP response from the root URL means the server is up.
try:
    r = requests.get(OLLAMA_BASE, timeout=3)
    print(f"Ollama server: UP (status {r.status_code})")
except requests.exceptions.RequestException:  # was a bare except: — keep it narrow
    print("Ollama server: DOWN β run 'ollama serve'")

# /api/tags — list installed models
tags = ollama_request("/api/tags", method="GET")
print("Installed models:")
for m in tags.get("models", []):
    size_gb = m.get("size", 0) / 1e9  # reported in bytes; show decimal GB
    print(f"  {m['name']:<35} {size_gb:.1f} GB")

# /api/generate — single-turn completion (stream=False returns one JSON blob)
payload = {
    "model": "llama3.2",
    "prompt": "Name three benefits of running LLMs locally.",
    "stream": False,
    "options": {"temperature": 0.2, "num_predict": 150},  # num_predict caps output tokens
}
result = ollama_request("/api/generate", payload)
print(result.get("response", "(no response)"))
# eval_duration is reported in nanoseconds.
print(f"\nTokens: {result.get('eval_count', '?')} | Time: {result.get('eval_duration', 0)/1e9:.2f}s")

# /api/chat — multi-turn chat with role-tagged messages
chat_payload = {
    "model": "llama3.2",
    "messages": [
        {"role": "system", "content": "You are a concise technical assistant."},
        {"role": "user", "content": "What is a GGUF file?"},
    ],
    "stream": False,
}
chat_result = ollama_request("/api/chat", chat_payload)
print(chat_result.get("message", {}).get("content", "(no response)"))
# ββ Streaming /api/chat βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def ollama_stream_chat(messages: List[dict], model: str = "llama3.2") -> str:
    """Stream tokens from the Ollama REST API, printing each as it arrives.

    Args:
        messages: Chat history as [{"role": ..., "content": ...}, ...].
        model: Name of an installed Ollama model.

    Returns:
        The complete assistant reply (previously discarded; returning it is
        backward-compatible and lets callers reuse the text).
    """
    url = f"{OLLAMA_BASE}/api/chat"
    payload = {"model": model, "messages": messages, "stream": True}
    full_reply = ""
    with requests.post(url, json=payload, stream=True, timeout=120) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue  # skip keep-alive blank lines
            chunk = json.loads(line)
            token = chunk.get("message", {}).get("content", "")
            if token:
                print(token, end="", flush=True)
                full_reply += token
            if chunk.get("done"):
                print()  # terminating newline after the stream
                break
    return full_reply

print("Streaming response:\n")
ollama_stream_chat([
    {"role": "user", "content": "Explain quantisation in one short paragraph."}
])
1.2 Model Management APIΒΆ
# Inspect an installed model via /api/show.
details = ollama_request("/api/show", {"name": "llama3.2"})
if details:
    info = details.get("model_info", {})
    fallback = details.get("details", {})  # older servers put fields here
    print("Model details:")
    for key in ("general.architecture", "general.parameter_count",
                "llama.context_length", "llama.embedding_length"):
        val = info.get(key, fallback.get(key, "?"))
        print(f"  {key}: {val}")
    print(f"  quantization_level: {fallback.get('quantization_level', '?')}")
    print(f"  parameter_size: {fallback.get('parameter_size', '?')}")

# Pull a model via /api/pull — commented out because downloads take minutes.
# pull_payload = {"name": "qwen2.5:7b", "stream": False}
# print("Pulling model...")
# pull_result = ollama_request("/api/pull", pull_payload)
# print(pull_result.get("status", "?"))
print("Pull command ready (uncomment to run).")
1.3 OpenAI-Compatible /v1 EndpointΒΆ
Ollama implements the OpenAI Chat Completions API. Just change base_url.
# The official openai client works against Ollama — only base_url changes.
ollama_client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",  # Required by the library but ignored by Ollama
)

# Non-streaming call: identical shape to the hosted OpenAI API.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the difference between RAG and fine-tuning?"},
]
completion = ollama_client.chat.completions.create(
    model="llama3.2",
    messages=conversation,
    temperature=0.2,
    max_tokens=200,
)
print(completion.choices[0].message.content)
print(f"\nModel: {completion.model}")
print(f"Tokens used: {completion.usage.total_tokens}")

# Streaming call: iterate over chunked deltas as they arrive.
print("Streaming response from Ollama via OpenAI client:\n")
for part in ollama_client.chat.completions.create(
    model="llama3.2",
    messages=[{"role": "user", "content": "List 5 open-source LLMs and their best use cases."}],
    stream=True,
    max_tokens=300,
):
    piece = part.choices[0].delta.content
    if piece:
        print(piece, end="", flush=True)
print()
Part 2 β llama.cpp ServerΒΆ
The llama.cpp server is the fastest option for CPU-only inference. It uses GGUF models and exposes an OpenAI-compatible API.
InstallationΒΆ
# macOS (Homebrew)
brew install llama.cpp
# Build from source (Linux / macOS)
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release
# With CUDA support (Linux)
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
GGUF Models β Where to DownloadΒΆ
# Hugging Face β TheBloke and bartowski have curated GGUF collections
# Example: Llama 3.2 3B Q4_K_M
huggingface-cli download \
bartowski/Llama-3.2-3B-Instruct-GGUF \
Llama-3.2-3B-Instruct-Q4_K_M.gguf \
--local-dir ./models
# Or use wget directly from the HF CDN
wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/\
Llama-3.2-3B-Instruct-Q4_K_M.gguf -P ./models
Starting the ServerΒΆ
# Basic server (CPU)
llama-server \
--model ./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf \
--port 8080 \
--ctx-size 4096
# With GPU acceleration (Metal on macOS / CUDA on Linux)
llama-server \
--model ./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf \
--port 8080 \
--ctx-size 4096 \
--n-gpu-layers 35 # offload all layers to GPU
# Performance flags for CPU
llama-server \
--model ./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf \
--port 8080 \
--threads 8 \
--threads-batch 8 \
--ctx-size 2048 \
--batch-size 512
# Connect to llama.cpp server (same OpenAI interface)
LLAMA_CPP_BASE = "http://localhost:8080"  # default llama.cpp port

def check_server(base_url: str, name: str) -> bool:
    """Check if an API server is reachable.

    Tries the /health endpoint first (llama.cpp exposes one), then falls
    back to the bare base URL (Ollama and LM Studio answer there).

    Args:
        base_url: Server root, e.g. 'http://localhost:8080'.
        name: Human-readable label used in the status message.

    Returns:
        True if either probe received any HTTP response, else False.
    """
    for probe in (f"{base_url}/health", base_url):
        try:
            requests.get(probe, timeout=2)
            print(f"{name}: UP at {base_url}")
            return True
        except requests.exceptions.RequestException:  # was a bare except:
            continue
    print(f"{name}: DOWN (not running at {base_url})")
    return False
# Probe the llama.cpp server, then talk to it through the OpenAI client.
llama_cpp_up = check_server(LLAMA_CPP_BASE, "llama.cpp server")
if not llama_cpp_up:
    print("\nTo test this section, start llama.cpp server first:")
    print("  llama-server --model ./model.gguf --port 8080")
else:
    llama_cpp_client = OpenAI(
        base_url=f"{LLAMA_CPP_BASE}/v1",
        api_key="not-needed",
    )
    reply = llama_cpp_client.chat.completions.create(
        model="local",  # llama.cpp ignores model name
        messages=[{"role": "user", "content": "Hello, are you running locally?"}],
        max_tokens=50,
    )
    print(reply.choices[0].message.content)
GGUF Format and Quantisation NamingΒΆ
GGUF filenames encode quantisation level:
Llama-3.2-3B-Instruct-Q4_K_M.gguf
β β β
β β βββ S/M/L = size variant within the quant
β ββββ K = K-quant (better than legacy Q-quant)
βββββββ 4 = 4 bits per weight
Name |
Bits |
RAM vs fp16 |
Quality |
|---|---|---|---|
Q2_K |
2.3 |
20% |
Poor |
Q4_K_S |
4.4 |
28% |
Good |
Q4_K_M |
4.8 |
30% |
Very good |
Q5_K_M |
5.7 |
36% |
Great |
Q6_K |
6.6 |
41% |
Excellent |
Q8_0 |
8.0 |
50% |
Near-perfect |
F16 |
16 |
100% |
Reference |
Part 3 β LM StudioΒΆ
LM Studio is a GUI application that bundles a model browser, chat interface, and OpenAI-compatible API server. It runs on macOS, Windows, and Linux.
SetupΒΆ
Download from https://lmstudio.ai
Search for and download a model (e.g.
llama-3.2-3b-instruct)Go to Local Server tab β Start Server (default port 1234)
APIΒΆ
LM Studio exposes the same OpenAI-compatible API. The model name in requests is the file path shown in the UI.
# Connect to LM Studio (same API, different base_url)
LM_STUDIO_BASE = "http://localhost:1234"  # LM Studio default
lm_studio_up = check_server(LM_STUDIO_BASE, "LM Studio")
if not lm_studio_up:
    print("\nTo test: open LM Studio β Local Server β Start Server")
else:
    lm_studio_client = OpenAI(
        base_url=f"{LM_STUDIO_BASE}/v1",
        api_key="lm-studio",
    )
    # Enumerate whatever models the GUI currently has loaded.
    loaded = lm_studio_client.models.list()
    print("Models loaded in LM Studio:")
    for entry in loaded.data:
        print(f"  {entry.id}")
    # Chat with the first loaded model; fall back to a placeholder name.
    target = loaded.data[0].id if loaded.data else "local-model"
    reply = lm_studio_client.chat.completions.create(
        model=target,
        messages=[{"role": "user", "content": "Say hello from LM Studio!"}],
        max_tokens=50,
    )
    print(reply.choices[0].message.content)
Part 4 β vLLM: Best GPU ThroughputΒΆ
vLLM is the industry standard for serving LLMs on NVIDIA GPUs. It uses PagedAttention for maximum throughput and supports continuous batching.
Docker Setup (Recommended)ΒΆ
# Pull the vLLM Docker image
docker pull vllm/vllm-openai:latest
# Serve Llama 3.2 3B (HuggingFace model ID)
docker run --gpus all \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct \
--max-model-len 4096
# With quantisation (saves GPU memory)
docker run --gpus all \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct \
--quantization awq \
--max-model-len 4096
# Multi-GPU tensor parallelism
docker run --gpus all \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.3-70B-Instruct \
--tensor-parallel-size 4 \
--max-model-len 4096
pip installΒΆ
pip install vllm
vllm serve meta-llama/Llama-3.2-3B-Instruct --port 8000
# Connect to vLLM (same OpenAI client)
VLLM_BASE = "http://localhost:8000"  # vLLM default
vllm_up = check_server(VLLM_BASE, "vLLM")
if not vllm_up:
    print("\nTo test: docker run --gpus all -p 8000:8000 vllm/vllm-openai:latest --model ...")
else:
    vllm_client = OpenAI(
        base_url=f"{VLLM_BASE}/v1",
        api_key="empty",  # vLLM ignores the key
    )
    # vLLM serves exactly the model it was launched with — ask for its name.
    served = vllm_client.models.list()
    model_name = served.data[0].id if served.data else "?"
    print(f"Serving model: {model_name}")
    answer = vllm_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "What are the advantages of vLLM over naive serving?"}],
        max_tokens=200,
    )
    print(answer.choices[0].message.content)
4.1 Batch Inference with vLLMΒΆ
import concurrent.futures

def batch_completions(
    client: "OpenAI",
    prompts: List[str],
    model: str,
    max_tokens: int = 100,
    max_workers: int = 8,
) -> List[str]:
    """
    Send multiple prompts concurrently and return the replies in order.

    vLLM handles continuous batching internally, so fanning requests out
    from a thread pool lets the server merge them into efficient batches.
    The annotation is a string so this cell does not require the openai
    import to be evaluated at definition time.

    Args:
        client: Any OpenAI-compatible client (vLLM, Ollama, llama.cpp, ...).
        prompts: One user prompt per request.
        model: Model name passed through to the server.
        max_tokens: Per-request completion cap.
        max_workers: Maximum concurrent in-flight requests.

    Returns:
        Completions in the same order as `prompts`.
    """
    def single(prompt: str) -> str:
        # One blocking chat call; executor.map preserves input order.
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
        return resp.choices[0].message.content

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(single, prompts))
# Example (runs against Ollama if vLLM is not available)
active_client = vllm_client if vllm_up else ollama_client
active_model = "llama3.2"

# Four independent prompts fanned out concurrently.
prompts = [
    "Translate 'hello world' to French.",
    "What is 2 + 2 * 3?",
    "Name the capital of Japan.",
    "Write a Python one-liner that reverses a string.",
]
t0 = time.perf_counter()
answers = batch_completions(active_client, prompts, model=active_model, max_tokens=60)
elapsed = time.perf_counter() - t0
print(f"Completed {len(prompts)} requests in {elapsed:.2f}s")
for question, answer in zip(prompts, answers):
    print(f"\nQ: {question}")
    print(f"A: {answer.strip()}")
Part 5 β Benchmarking Local ServersΒΆ
Measure tokens per second and first-token latency across servers.
def benchmark_server(
    client: "OpenAI",
    model: str,
    prompt: str,
    max_tokens: int = 200,
    runs: int = 3,
) -> Dict:
    """
    Measure latency and throughput for an OpenAI-compatible server.

    Each run streams one completion and records time-to-first-token (TTFT),
    total wall time, and the number of content chunks received (a close
    proxy for output tokens — servers emit roughly one token per chunk).

    Args:
        client: OpenAI-compatible client pointed at the server under test.
        model: Model name to benchmark.
        prompt: User prompt sent on every run.
        max_tokens: Completion cap per run.
        runs: Number of repetitions to average over (must be >= 1).

    Returns:
        dict with keys:
            avg_first_token_s: mean time to first token (TTFT), seconds
            avg_total_s: mean total generation time, seconds
            avg_tokens_per_sec: estimated throughput
            avg_output_tokens: mean content chunks per run
    """
    first_token_times = []
    total_times = []
    token_counts = []
    for _ in range(runs):
        first_token_time = None
        token_count = 0
        start = time.perf_counter()
        stream = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            stream=True,
        )
        for chunk in stream:
            # Some servers (vLLM, OpenAI) send a final chunk with empty
            # choices (e.g. a usage chunk); guard before indexing.
            if chunk.choices and chunk.choices[0].delta.content:
                if first_token_time is None:
                    first_token_time = time.perf_counter() - start
                token_count += 1
        total_time = time.perf_counter() - start
        # If no content arrived at all, fall back to the total time as TTFT.
        first_token_times.append(first_token_time or total_time)
        total_times.append(total_time)
        token_counts.append(token_count)
    # Guard against zero elapsed time (e.g. instant failure).
    avg_tps = sum(token_counts) / sum(total_times) if sum(total_times) > 0 else 0
    return {
        "avg_first_token_s": round(sum(first_token_times) / len(first_token_times), 3),
        "avg_total_s": round(sum(total_times) / len(total_times), 3),
        "avg_tokens_per_sec": round(avg_tps, 1),
        "avg_output_tokens": round(sum(token_counts) / len(token_counts), 0),
    }
BENCHMARK_PROMPT = "Explain the difference between supervised, unsupervised, and reinforcement learning."
print("Running benchmarks...\n")
# (name, client, model, is_up) rows; uncomment entries once those servers run.
servers = [
    ("Ollama", ollama_client, "llama3.2", True),
    # ("llama.cpp", llama_cpp_client, "local", llama_cpp_up),
    # ("vLLM", vllm_client, active_model, vllm_up),
]
benchmark_results = []
for name, client, model, is_up in servers:
    if not is_up:
        print(f"Skipping {name} (not running)")
        continue
    print(f"Benchmarking {name} ({model})...")
    try:
        stats = benchmark_server(client, model, BENCHMARK_PROMPT, runs=2)
        stats["server"] = name
        benchmark_results.append(stats)
        print(f"  TTFT: {stats['avg_first_token_s']}s | Total: {stats['avg_total_s']}s | {stats['avg_tokens_per_sec']} tok/s")
    except Exception as e:
        print(f"  Error: {e}")
if benchmark_results:
    print("\nSummary (sorted by throughput):")
    ranked = sorted(benchmark_results, key=lambda row: row["avg_tokens_per_sec"], reverse=True)
    for row in ranked:
        print(f"  {row['server']:<15} TTFT={row['avg_first_token_s']}s | {row['avg_tokens_per_sec']} tok/s")
Part 6 β Building Applications That Work With Any ServerΒΆ
The key insight: all these servers share the OpenAI Chat Completions API. Write once, switch freely.
class LocalLLMClient:
    """
    Unified client for any OpenAI-compatible local LLM server.

    Transparently switches between Ollama, llama.cpp, LM Studio, and vLLM —
    the backends differ only in base_url and the (ignored) api_key.
    """

    # Default local endpoints for the supported backends.
    PRESETS = {
        "ollama": {"base_url": "http://localhost:11434/v1", "api_key": "ollama"},
        "llama.cpp": {"base_url": "http://localhost:8080/v1", "api_key": "none"},
        "lmstudio": {"base_url": "http://localhost:1234/v1", "api_key": "lm-studio"},
        "vllm": {"base_url": "http://localhost:8000/v1", "api_key": "empty"},
    }

    def __init__(
        self,
        server: str = "ollama",
        model: str = "llama3.2",
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
    ):
        """
        Args:
            server: A PRESETS key, or any label when base_url is supplied.
            model: Model name sent with every request.
            base_url: Explicit endpoint; overrides the preset.
            api_key: Explicit key; local servers ignore the value.

        Raises:
            ValueError: If `server` is unknown and no base_url is given.
                (Previously this silently fell through to the hosted
                OpenAI endpoint, leaking requests off the machine.)
        """
        preset = self.PRESETS.get(server, {})
        if base_url is None and not preset:
            raise ValueError(
                f"Unknown server {server!r} and no base_url given; "
                f"known servers: {sorted(self.PRESETS)}"
            )
        self.server = server
        self.model = model
        self.client = OpenAI(
            base_url=base_url or preset.get("base_url"),
            api_key=api_key or preset.get("api_key", "none"),
        )

    def chat(
        self,
        messages: List[Dict],
        temperature: float = 0.7,
        max_tokens: int = 512,
        stream: bool = False,
    ) -> str:
        """Send messages and return the assistant reply.

        When stream=True the reply is additionally printed token-by-token
        as it arrives; the full text is returned either way.
        """
        kwargs = dict(
            model=self.model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=stream,
        )
        if stream:
            full = ""
            for chunk in self.client.chat.completions.create(**kwargs):
                delta = chunk.choices[0].delta.content or ""
                print(delta, end="", flush=True)
                full += delta
            print()
            return full
        resp = self.client.chat.completions.create(**kwargs)
        return resp.choices[0].message.content

    def complete(self, prompt: str, **kwargs) -> str:
        """Single-turn completion helper (wraps chat with one user message)."""
        return self.chat([{"role": "user", "content": prompt}], **kwargs)

    def switch_server(self, server: str, model: Optional[str] = None):
        """Switch to a different backend at runtime.

        Args:
            server: A PRESETS key.
            model: Optionally change the model at the same time.

        Raises:
            ValueError: If `server` is not a known preset.
        """
        preset = self.PRESETS.get(server)
        if preset is None:
            raise ValueError(
                f"Unknown server {server!r}; known servers: {sorted(self.PRESETS)}"
            )
        self.server = server
        if model:
            self.model = model
        self.client = OpenAI(
            base_url=preset.get("base_url"),
            api_key=preset.get("api_key", "none"),
        )
        print(f"Switched to {server} ({self.model})")
# Create a client pointing to Ollama
llm = LocalLLMClient(server="ollama", model="llama3.2")
print(llm.complete("What is PagedAttention in vLLM?"))

# Demonstrate server switching at runtime
print("Using Ollama:")
llm.switch_server("ollama", model="llama3.2")
print(llm.complete("What is 7 Γ 8?", max_tokens=20))

# Same question through llama.cpp, if a server is listening.
if llama_cpp_up:
    print("\nSwitching to llama.cpp:")
    llm.switch_server("llama.cpp", model="local")
    print(llm.complete("What is 7 Γ 8?", max_tokens=20))

# And through vLLM, if a server is listening.
if vllm_up:
    print("\nSwitching to vLLM:")
    llm.switch_server("vllm", model=active_model)
    print(llm.complete("What is 7 Γ 8?", max_tokens=20))
Part 7 β Practical Application: Code Review BotΒΆ
class CodeReviewBot:
    """
    Local code review assistant backed by any OpenAI-compatible server.
    All code stays on your machine.
    """

    SYSTEM_PROMPT = """You are an expert code reviewer. Analyse code for:
- Bugs and logic errors
- Security vulnerabilities
- Performance issues
- Style and readability
Be concise. Use bullet points."""

    def __init__(self, client: LocalLLMClient):
        # Any LocalLLMClient backend works; requests never leave the machine.
        self.client = client

    def review(self, code: str, language: str = "python") -> str:
        """Return a bullet-point review of `code` (low temperature keeps it focused)."""
        request = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Review this {language} code:\n\n```{language}\n{code}\n```",
            },
        ]
        return self.client.chat(request, temperature=0.1, max_tokens=400)
# Sample code to review
buggy_code = """
import sqlite3
def get_user(username):
conn = sqlite3.connect('users.db')
cursor = conn.cursor()
query = "SELECT * FROM users WHERE username = '" + username + "'"
cursor.execute(query)
result = cursor.fetchall()
return result
def update_balance(user_id, amount):
conn = sqlite3.connect('users.db')
cursor = conn.cursor()
cursor.execute(f"UPDATE accounts SET balance = balance + {amount} WHERE id = {user_id}")
conn.commit()
"""
bot = CodeReviewBot(LocalLLMClient(server="ollama", model="llama3.2"))
print("Code Review (running locally):\n")
print(bot.review(buggy_code))
Part 8 β Hardware RecommendationsΒΆ
Apple M Series (Unified Memory)ΒΆ
Apple Silicon is the best consumer hardware for local LLMs because CPU and GPU share the same memory pool.
Chip |
Total RAM |
Recommended Models |
Perf. |
|---|---|---|---|
M1/M2 8GB |
~6GB usable |
llama3.2:3b, phi3:mini |
Slow |
M1/M2 16GB |
~13GB usable |
mistral:7b, llama3.2 |
Good |
M1 Max/M2 Pro 32GB |
~28GB usable |
phi4, qwen2.5:14b |
Very good |
M2 Ultra 64GB |
~56GB usable |
llama3.3:70b (q4) |
Excellent |
M3 Max 128GB |
~110GB usable |
Any model |
Exceptional |
Key flag for Ollama on Mac: models are automatically Metal-accelerated.
NVIDIA GPUs (CUDA)ΒΆ
GPU |
VRAM |
Recommended Models |
|---|---|---|
RTX 3060 |
12 GB |
7B at Q4 |
RTX 3090 |
24 GB |
13B at Q8 or 30B at Q4 |
RTX 4090 |
24 GB |
13B at fp16 or 34B at Q4 |
A100 40GB |
40 GB |
70B at Q4 |
A100 80GB |
80 GB |
70B at Q8 or fp16 |
Best server for NVIDIA: vLLM (highest throughput, continuous batching).
AMD GPUs (ROCm)ΒΆ
Ollama and llama.cpp support ROCm. Performance is close to CUDA for inference.
# Ollama with ROCm
HSA_OVERRIDE_GFX_VERSION=10.3.0 ollama serve
CPU-OnlyΒΆ
Use llama.cpp with Q4 models. Expect 1β5 tokens/sec on modern CPUs.
# Optimise for AVX2 CPU
llama-server --model model.gguf --threads $(nproc) --ctx-size 2048
import platform
import subprocess

def hardware_recommendations():
    """Detect the local hardware and print server + model recommendations.

    Covers three cases:
      * Apple Silicon (Darwin/arm64): reads unified memory via sysctl.
      * Linux: probes for an NVIDIA GPU with nvidia-smi.
      * Windows: generic GUI-first recommendation.

    Prints its findings; returns None.
    """
    system = platform.system()
    machine = platform.machine()
    print(f"System: {system} | Architecture: {machine}\n")
    if system == "Darwin" and machine == "arm64":
        # Apple Silicon: total RAM equals the unified memory the GPU can use.
        try:
            mem_bytes = int(
                subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip()
            )
            ram_gb = mem_bytes / 1e9
        except Exception:
            ram_gb = 16  # default guess when sysctl is unavailable
        print(f"Apple Silicon detected. Unified memory: {ram_gb:.0f} GB")
        print("Recommended server: Ollama (Metal acceleration is automatic)\n")
        if ram_gb >= 64:
            print("Models: llama3.3:70b, qwen2.5:72b, deepseek-r1:70b")
        elif ram_gb >= 32:
            print("Models: phi4, qwen2.5:14b, gemma2:27b (q4)")
        elif ram_gb >= 16:
            print("Models: mistral, qwen2.5:7b, gemma2:9b")
        else:
            print("Models: llama3.2:3b, phi3:mini, gemma2:2b")
    elif system == "Linux":
        # Probe for an NVIDIA GPU; nvidia-smi may be absent, fail, or hang.
        try:
            nvidia_out = subprocess.check_output(
                ["nvidia-smi", "--query-gpu=name,memory.total",
                 "--format=csv,noheader"],
                timeout=5,
            ).decode().strip()
            print(f"NVIDIA GPU detected:\n  {nvidia_out}")
            print("Recommended server: vLLM (best throughput) or Ollama (easiest setup)")
        except (FileNotFoundError, subprocess.CalledProcessError,
                subprocess.TimeoutExpired):  # TimeoutExpired was previously uncaught
            print("No NVIDIA GPU found.")
            print("Recommended server: llama.cpp (optimised CPU inference)")
            print("Models: llama3.2:3b (q4), phi3:mini")
    elif system == "Windows":
        print("Windows detected.")
        print("Recommended: LM Studio (easiest GUI setup) or Ollama")

hardware_recommendations()
Part 9 β Server Selection GuideΒΆ
# Decision matrix: which server to reach for in each scenario.
_rows = [
    ("Development / prototyping on Mac",
     "Ollama",
     "One command setup, automatic Metal GPU, great model library"),
    ("CPU-only server, maximum speed",
     "llama.cpp server",
     "GGUF kernels are the most optimised for CPU inference"),
    ("Non-technical users, Windows/macOS",
     "LM Studio",
     "GUI model browser, one-click server start"),
    ("Production GPU server, high concurrency",
     "vLLM",
     "PagedAttention + continuous batching = highest throughput"),
    ("Existing OpenAI code, swap to local",
     "Any (all OpenAI-compatible)",
     "Change base_url and api_key only"),
    ("Edge device / Raspberry Pi",
     "llama.cpp",
     "Minimal dependencies, highly optimised C++ inference"),
]
guide = [{"scenario": s, "server": srv, "why": why} for s, srv, why in _rows]
print(f"{'Scenario':<40} {'Server':<25} {'Why'}")
print("-" * 100)
for entry in guide:
    print(f"{entry['scenario']:<40} {entry['server']:<25} {entry['why']}")
Part 10 β Full Working Example: Local Chat ApplicationΒΆ
class LocalChatApp:
    """
    Multi-turn chat application backed by any local LLM server.
    Supports system prompts, conversation history, and server switching.
    """

    def __init__(
        self,
        server: str = "ollama",
        model: str = "llama3.2",
        system_prompt: str = "You are a helpful assistant.",
    ):
        # The system prompt always lives at history[0]; reset() preserves it.
        self.llm = LocalLLMClient(server=server, model=model)
        self.history: List[Dict] = [{"role": "system", "content": system_prompt}]

    def send(self, message: str, stream: bool = True) -> str:
        """Send a message and get a streamed response."""
        self.history.append({"role": "user", "content": message})
        answer = self.llm.chat(self.history, stream=stream, max_tokens=300)
        self.history.append({"role": "assistant", "content": answer})
        return answer

    def reset(self):
        """Clear conversation history (keep system prompt)."""
        self.history = self.history[:1]
        print("Conversation reset.")

    def print_history(self):
        """Print the full conversation, truncating messages over 200 chars."""
        for msg in self.history:
            role = msg["role"].upper()
            text = msg["content"]
            if len(text) > 200:
                print(f"[{role}]: {text[:200]}...")
            else:
                print(f"[{role}]: {text}")
# Use it: a two-turn conversation against the local Ollama backend.
app = LocalChatApp(
    server="ollama",
    model="llama3.2",
    system_prompt="You are a concise Python expert. Keep answers short.",
)
print("Turn 1:")
# send() returns the reply and also appends it to app.history.
app.send("What is a context manager in Python?", stream=False)
# Use stream=True for real-time token streaming
print("\n" + "-" * 60 + "\n")
print("Turn 2 (follow-up):")
# The model receives the full history, so "a minimal example" refers to turn 1.
reply2 = app.send("Can you show me a minimal example?", stream=False)
print(reply2)
print("\n" + "-" * 60)
print("\nConversation history:")
app.print_history()
Part 11 β Subprocess: Launch Servers ProgrammaticallyΒΆ
import shutil
def start_ollama_serve() -> Optional[subprocess.Popen]:
    """
    Start Ollama in the background if it isn't already running.

    Returns:
        The new process handle, or None when Ollama was already running,
        the binary is missing from PATH, or the server failed to answer
        within ~15 seconds.
    """
    try:
        requests.get("http://localhost:11434", timeout=1)
        print("Ollama already running.")
        return None
    except requests.exceptions.RequestException:  # not reachable — start it
        pass
    if shutil.which("ollama") is None:
        print("ollama not found in PATH.")
        return None
    proc = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    # Poll once per second until the HTTP port answers (max ~15 s).
    for _ in range(15):
        time.sleep(1)
        try:
            requests.get("http://localhost:11434", timeout=1)
            print(f"Ollama started (PID {proc.pid}).")
            return proc
        except requests.exceptions.RequestException:  # was a bare except:
            pass
    print("Ollama did not start in time.")
    return None
def start_llama_cpp_server(
    model_path: str,
    port: int = 8080,
    n_gpu_layers: int = -1,
    ctx_size: int = 4096,
) -> Optional[subprocess.Popen]:
    """
    Start the llama.cpp server in the background.

    Args:
        model_path: Path to a GGUF model file.
        port: TCP port to listen on.
        n_gpu_layers: GPU layers to offload (-1 = all).
        ctx_size: Context window size in tokens.

    Returns:
        The Popen handle once /health responds, or None if llama-server is
        not installed or did not answer within ~30 s (large models can load
        slowly; the spawned process is left running in that case).
    """
    if shutil.which("llama-server") is None:
        print("llama-server not found. Install llama.cpp first.")
        return None
    cmd = [
        "llama-server",
        "--model", model_path,
        "--port", str(port),
        "--ctx-size", str(ctx_size),
        "--n-gpu-layers", str(n_gpu_layers),
    ]
    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    # Poll /health once per second until the server answers (max ~30 s).
    for _ in range(30):
        time.sleep(1)
        try:
            requests.get(f"http://localhost:{port}/health", timeout=1)
            print(f"llama.cpp server started on port {port} (PID {proc.pid}).")
            return proc
        except requests.exceptions.RequestException:  # was a bare except:
            pass
    print("llama.cpp server did not start in time.")
    return None
# Ensure Ollama is running (spawns a background process if the port is idle;
# `proc` is None when the server was already up or could not be started).
proc = start_ollama_serve()
# Example: start llama.cpp (replace with your GGUF path)
# llama_proc = start_llama_cpp_server("./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf")
Key TakeawaysΒΆ
All four servers (Ollama, llama.cpp, LM Studio, vLLM) expose an OpenAI-compatible /v1/chat/completions endpoint
To switch servers: change base_url in your OpenAI() client — no other code changes needed
Ollama is the easiest starting point for development; vLLM is best for GPU production
llama.cpp server wins on CPU-only machines; LM Studio is best for non-technical users
Apple Silicon users get free Metal GPU acceleration via Ollama
GGUF Q4_K_M is the recommended default quantisation β minimal quality loss at 25% of fp16 memory
Benchmarking matters β first-token latency (TTFT) and tokens/sec vary significantly by server and hardware
Next StepsΒΆ
03_local_rag_with_ollama.ipynb — Build a fully local RAG pipeline
Add authentication to your local server for network deployments
Explore OpenWebUI for a full ChatGPT-like interface over any local server
Try vllm serve on a cloud GPU instance (A10, L4, A100) for production throughput