# Install required packages
!pip install -q transformers torch accelerate sentencepiece protobuf pandas matplotlib plotly
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
Part 1: Major Open Source Model FamiliesΒΆ
The open-source LLM ecosystem has expanded rapidly, with each model family offering distinct trade-offs in size, capability, licensing, and specialization. The table below catalogs the major models with their parameter counts, context lengths, benchmark scores, and license types. Understanding this landscape is essential for making informed decisions about which model to deploy — a choice that affects inference cost, hardware requirements, output quality, and legal compliance. The MMLU (Massive Multitask Language Understanding) score provides a rough proxy for general knowledge, though task-specific benchmarks are always more relevant for your use case.
# Comprehensive model database: one row per model, columns declared once.
# Grouped by family: Llama, Mistral, Falcon, Phi, Gemma, then others.
_model_rows = [
    # (name, params, context, license, release, commercial use, MMLU)
    ('Llama 2 7B',    '7B',   4096,  'Llama 2',        '2023-07', 'Yes*',       45.3),
    ('Llama 2 13B',   '13B',  4096,  'Llama 2',        '2023-07', 'Yes*',       54.8),
    ('Llama 2 70B',   '70B',  4096,  'Llama 2',        '2023-07', 'Yes*',       68.9),
    ('Llama 3 8B',    '8B',   8192,  'Llama 3',        '2024-04', 'Yes',        66.6),
    ('Llama 3 70B',   '70B',  8192,  'Llama 3',        '2024-04', 'Yes',        79.5),
    ('Mistral 7B',    '7B',   8192,  'Apache 2.0',     '2023-09', 'Yes',        60.1),
    ('Mixtral 8x7B',  '47B',  32768, 'Apache 2.0',     '2023-12', 'Yes',        70.6),
    ('Mixtral 8x22B', '141B', 65536, 'Apache 2.0',     '2024-04', 'Yes',        77.8),
    ('Falcon 7B',     '7B',   2048,  'Apache 2.0',     '2023-03', 'Yes',        53.0),
    ('Falcon 40B',    '40B',  2048,  'Apache 2.0',     '2023-05', 'Yes',        55.4),
    ('Falcon 180B',   '180B', 2048,  'TII Falcon',     '2023-09', 'Restricted', 68.7),
    ('Phi-2',         '2.7B', 2048,  'MIT',            '2023-12', 'Yes',        56.3),
    ('Phi-3 Mini',    '3.8B', 4096,  'MIT',            '2024-04', 'Yes',        69.1),
    ('Phi-3 Medium',  '14B',  8192,  'MIT',            '2024-04', 'Yes',        75.3),
    ('Gemma 2B',      '2B',   8192,  'Gemma',          '2024-02', 'Yes',        42.3),
    ('Gemma 7B',      '7B',   8192,  'Gemma',          '2024-02', 'Yes',        64.3),
    ('Yi 34B',        '34B',  4096,  'Apache 2.0',     '2023-11', 'Yes',        72.2),
    ('Qwen 72B',      '72B',  32768, 'Tongyi Qianwen', '2023-11', 'Restricted', 77.4),
    ('DeepSeek 67B',  '67B',  4096,  'MIT',            '2024-01', 'Yes',        71.3),
]
_model_cols = [
    'Model', 'Parameters', 'Context_Length', 'License',
    'Release_Date', 'Commercial_Use', 'MMLU_Score',
]
# Pivot the row tuples into the same column-oriented dict as before.
models_data = {col: [row[i] for row in _model_rows]
               for i, col in enumerate(_model_cols)}
df_models = pd.DataFrame(models_data)
print("Open Source LLM Landscape:\n")
df_models
# Visualize model performance vs size
# Extract numeric parameter counts from strings like '7B' or '2.7B'.
# BUG FIX: the original pattern '(\d+)' stopped at the decimal point, so
# '2.7B' became 2.0 and '3.8B' became 3.0, misplacing the Phi models on the
# x-axis. The decimal-aware raw-string pattern keeps fractional billions
# (and avoids the invalid-escape-sequence warning from a non-raw '\d').
df_models['Param_Numeric'] = df_models['Parameters'].str.extract(r'(\d+\.?\d*)').astype(float)
fig = px.scatter(
    df_models,
    x='Param_Numeric',
    y='MMLU_Score',
    size='Context_Length',     # bubble size encodes context window
    color='License',
    hover_data=['Model', 'Commercial_Use'],
    log_x=True,                # parameter counts span 2B-180B; log scale spreads them out
    title='Open Source Models: Performance vs Size',
    labels={'Param_Numeric': 'Parameters (Billions)', 'MMLU_Score': 'MMLU Score (%)'}
)
fig.update_layout(height=600)
fig.show()
print("\nπ Key Insights:")
print("- Larger models generally perform better (but not always!)")
print("- Mistral/Mixtral punch above their weight class")
print("- Phi models are efficient small models")
print("- Context length varies significantly (2K to 65K tokens)")
Part 2: Model Family Deep DivesΒΆ
Llama Family (Meta)ΒΆ
Llama 2:
Released July 2023
Sizes: 7B, 13B, 70B
License: Llama 2 (free for commercial use with restrictions)
Strengths: Well-rounded, strong performance
Weaknesses: Restrictive license for large deployments
Llama 3:
Released April 2024
Sizes: 8B, 70B
License: Llama 3 (more permissive)
Improvements: Better reasoning, coding, multilingual
8K context window (vs 4K in Llama 2)
Mistral FamilyΒΆ
Mistral 7B:
Apache 2.0 license (fully open)
Outperforms Llama 2 13B despite being smaller
Excellent efficiency
Great for fine-tuning
Mixtral (MoE):
Mixture of Experts architecture
8x7B: 47B total params, but only uses 13B per token
Very efficient inference
Massive context windows (32K-64K)
Microsoft PhiΒΆ
Tiny but mighty models
Phi-2: 2.7B parameters, rivals 7B models
Trained on high-quality data ("textbook" approach)
Excellent for edge devices and local deployment
MIT license
Google GemmaΒΆ
Based on Gemini architecture
2B and 7B sizes
Strong safety features built-in
Good instruction following
Terms of use required
# Compare top models by use case: each tuple is
# (use case, best model, alternative, rationale).
_use_case_rows = [
    ('General Chat',     'Llama 3 70B',   'Mixtral 8x7B', 'Excellent overall performance'),
    ('Code Generation',  'Llama 3 70B',   'DeepSeek 67B', 'Strong at code tasks'),
    ('Reasoning',        'Mixtral 8x22B', 'Llama 3 70B',  'Best reasoning capabilities'),
    ('Edge Deployment',  'Phi-3 Mini',    'Gemma 2B',     'Smallest with good performance'),
    ('Privacy-Sensitive','Llama 3 8B',    'Mistral 7B',   'Can self-host, no API calls'),
    ('Long Context',     'Mixtral 8x22B', 'Yi 34B',       'Massive 65K context'),
    ('Multilingual',     'Qwen 72B',      'Llama 3 70B',  'Trained on diverse languages'),
    ('Fine-tuning Base', 'Mistral 7B',    'Llama 2 7B',   'Clean, well-documented base'),
]
_use_case_cols = ['Use Case', 'Best Model', 'Alternative', 'Why']
# Transpose rows into the original column-oriented dict layout.
use_cases = dict(zip(_use_case_cols,
                     (list(column) for column in zip(*_use_case_rows))))
df_use_cases = pd.DataFrame(use_cases)
print("\nπ― Model Selection by Use Case:\n")
df_use_cases
Part 3: Licensing ConsiderationsΒΆ
Licensing is one of the most overlooked aspects of open-source model selection, yet it determines whether you can legally deploy a model in your product. The spectrum ranges from fully permissive licenses (Apache 2.0, MIT) that allow unrestricted commercial use, to restrictive licenses that impose revenue limits or prohibit using model outputs to train competing models. Always review the specific license terms before committing to a model for production use — switching models later due to licensing issues is far more expensive than choosing correctly upfront.
# License comparison: each tuple is
# (license, commercial use, modifications, redistribution, revenue limit, example models).
_license_rows = [
    ('Apache 2.0',  'Yes',               'Yes', 'Yes',              'None',     'Mistral, Falcon 7/40'),
    ('MIT',         'Yes',               'Yes', 'Yes',              'None',     'Phi, DeepSeek'),
    ('Llama 2',     'Yes (with limits)', 'Yes', 'Yes',              '700M MAU', 'Llama 2 all sizes'),
    ('Llama 3',     'Yes',               'Yes', 'Yes',              'None',     'Llama 3 all sizes'),
    ('Gemma Terms', 'Yes (T&C apply)',   'Yes', 'With attribution', 'None',     'Gemma 2B/7B'),
    ('Falcon TII',  'Restricted',        'Yes', 'Restricted',       'N/A',      'Falcon 180B'),
]
_license_cols = [
    'License', 'Commercial_Use', 'Modifications',
    'Redistribution', 'Revenue_Limit', 'Examples',
]
# Same column-oriented dict as before, built from the row tuples.
license_comparison = {col: [row[i] for row in _license_rows]
                      for i, col in enumerate(_license_cols)}
df_licenses = pd.DataFrame(license_comparison)
print("License Comparison:\n")
df_licenses
β οΈ Important License NotesΒΆ
Apache 2.0 & MIT:
β Most permissive
β No usage restrictions
β Recommended for startups
Llama 2:
β οΈ Free if < 700M monthly active users
β οΈ Need special license from Meta if larger
β οΈ Cannot use outputs to improve other LLMs
Llama 3:
β Removed MAU restriction
β More permissive than Llama 2
β οΈ Still has some restrictions
Always read the full license before deployment!
Part 4: Performance BenchmarksΒΆ
Benchmarks provide standardized comparisons across models, but they must be interpreted carefully. MMLU measures broad knowledge across 57 subjects, HellaSwag tests commonsense reasoning, TruthfulQA evaluates factual accuracy, HumanEval measures Python code generation ability, and GSM8K tests grade-school math reasoning. A model that scores highest on MMLU may underperform on code tasks, so always prioritize benchmarks that align with your actual use case. The heatmap below makes it easy to spot each model's strengths and weaknesses at a glance.
# Common benchmarks: one tuple per model
# (name, MMLU, HellaSwag, TruthfulQA, HumanEval, GSM8K) — higher is better.
_bench_rows = [
    ('Llama 3 70B',   79.5, 87.1, 51.2, 67.1, 93.0),
    ('Mixtral 8x22B', 77.8, 86.0, 49.8, 75.0, 88.6),
    ('Qwen 72B',      77.4, 85.7, 48.3, 68.2, 91.1),
    ('Llama 2 70B',   68.9, 85.3, 44.9, 29.9, 56.8),
    ('Yi 34B',        72.2, 84.9, 46.7, 67.8, 83.7),
    ('Mistral 7B',    60.1, 81.3, 42.1, 40.2, 52.2),
    ('Llama 3 8B',    66.6, 82.1, 45.3, 62.2, 79.6),
    ('Phi-3 Medium',  75.3, 83.2, 47.8, 54.7, 86.7),
    ('Gemma 7B',      64.3, 81.9, 43.2, 51.8, 59.8),
]
_bench_cols = ['Model', 'MMLU', 'HellaSwag', 'TruthfulQA', 'HumanEval', 'GSM8K']
# Rebuild the original column-oriented dict from the per-model rows.
benchmarks_data = {col: [row[i] for row in _bench_rows]
                   for i, col in enumerate(_bench_cols)}
df_bench = pd.DataFrame(benchmarks_data)
print("Benchmark Scores (Higher is Better):\n")
print(df_bench.to_string(index=False))
# Heatmap of benchmark scores: rows are models, columns are benchmarks.
# RdYlGn colorscale renders strong scores green and weak scores red.
score_matrix = df_bench.iloc[:, 1:].values
bench_labels = ['MMLU', 'HellaSwag', 'TruthfulQA', 'HumanEval (Code)', 'GSM8K (Math)']
fig = go.Figure(
    data=go.Heatmap(
        z=score_matrix,
        x=bench_labels,
        y=df_bench['Model'],
        colorscale='RdYlGn',
        text=score_matrix,           # raw scores, printed inside each cell
        texttemplate='%{text:.1f}',  # one decimal place per cell
        textfont={"size": 10},
        colorbar=dict(title="Score"),
    )
)
fig.update_layout(
    title='Model Benchmark Comparison',
    height=500,
    xaxis_title='Benchmark',
    yaxis_title='Model',
)
fig.show()
print("\nπ Benchmark Explanations:")
print("- MMLU: Multitask Language Understanding (general knowledge)")
print("- HellaSwag: Common sense reasoning")
print("- TruthfulQA: Ability to answer truthfully")
print("- HumanEval: Code generation accuracy")
print("- GSM8K: Grade school math problems")
Part 5: Model Selection FrameworkΒΆ
Choosing the right model requires balancing multiple constraints: maximum parameter count (determined by your GPU), license requirements (determined by your business model), minimum context length (determined by your input data), and target benchmark performance. The select_model() function below filters the model database against your constraints and returns the top candidates sorted by MMLU score. In practice, you should further narrow the shortlist by running evaluation on your own dataset, since benchmark scores do not always predict performance on domain-specific tasks.
def select_model(use_case, constraints):
    """
    Model selection helper function.

    Filters the module-level ``df_models`` table against the given
    constraints, prints use-case specific suggestions, and returns the
    top candidates ranked by MMLU score.

    Args:
        use_case: str - 'chat', 'code', 'reasoning', 'edge', etc.
        constraints: dict - {'max_size': '13B', 'license': 'apache', 'min_context': 8192}
            Keys are optional; missing keys fall back to permissive defaults.

    Returns:
        pd.DataFrame: up to 5 matching models with their key attributes,
        sorted by MMLU_Score descending.
    """
    # Parse constraints with permissive defaults.
    max_size = constraints.get('max_size', '70B')
    license_req = constraints.get('license', 'any').lower()
    min_context = constraints.get('min_context', 0)
    need_commercial = constraints.get('commercial', True)

    # Work on a copy so the shared df_models is never mutated.
    filtered = df_models.copy()

    # Derive numeric parameter counts if the plotting cell has not run yet
    # (previously this function crashed unless that cell created the column).
    # The decimal-aware pattern keeps '2.7B' as 2.7 rather than truncating to 2.
    if 'Param_Numeric' not in filtered.columns:
        filtered['Param_Numeric'] = (
            filtered['Parameters'].str.extract(r'(\d+\.?\d*)').astype(float)
        )

    # Size filter: accepts '13B', '13b', or a bare number string.
    max_size_num = float(str(max_size).upper().rstrip('B'))
    filtered = filtered[filtered['Param_Numeric'] <= max_size_num]
    # Context filter
    filtered = filtered[filtered['Context_Length'] >= min_context]
    # License filter: substring match, e.g. 'apache' matches 'Apache 2.0'.
    if license_req != 'any':
        filtered = filtered[filtered['License'].str.lower().str.contains(license_req)]
    # Commercial filter: 'Yes' also matches 'Yes*' (conditional allowances).
    if need_commercial:
        filtered = filtered[filtered['Commercial_Use'].str.contains('Yes')]

    # Rank survivors by general-knowledge benchmark performance.
    filtered = filtered.sort_values('MMLU_Score', ascending=False)

    # Use case specific recommendations (informational prints only).
    use_case = use_case.lower()
    if use_case == 'code':
        print("\nπ― Code Generation Recommendation:")
        print("Top choice: Look for high HumanEval scores")
        print("Suggested: Llama 3 70B, Mixtral 8x22B, DeepSeek 67B")
    elif use_case == 'reasoning':
        print("\nπ― Reasoning Task Recommendation:")
        print("Top choice: Mixtral 8x22B or Llama 3 70B")
    elif use_case == 'edge':
        print("\nπ― Edge Deployment Recommendation:")
        print("Top choice: Phi-3 Mini (3.8B) or Gemma 2B")
        print("These run on laptops and mobile devices!")
    elif use_case == 'chat':
        print("\nπ― Chat Application Recommendation:")
        print("Top choice: Llama 3 8B for efficiency, Llama 3 70B for quality")

    print("\nπ Models matching your constraints (top 5):\n")
    return filtered.head(5)[['Model', 'Parameters', 'MMLU_Score', 'License', 'Context_Length']]
# Example: Select model for chat with constraints.
# Looking for a small (<= 13B), Apache-licensed, commercially usable model
# with at least an 8K context window.
chat_constraints = {
    'max_size': '13B',
    'license': 'apache',
    'min_context': 8000,
    'commercial': True,
}
result = select_model(use_case='chat', constraints=chat_constraints)
result
Part 6: Deployment ConsiderationsΒΆ
The gap between "model works in a notebook" and "model serves production traffic" is bridged by understanding hardware requirements, quantization trade-offs, and cost economics. A 70B parameter model in FP16 needs 140GB of VRAM — far beyond any single consumer GPU — but 4-bit quantization reduces this to 35GB, making it feasible on a single A100. The cost comparison between local deployment, cloud hosting, and proprietary APIs reveals that open-source models become dramatically cheaper at high volumes (10M+ tokens per month), while proprietary APIs are simpler and more cost-effective at low volumes.
# Hardware requirements: one tuple per model-size class —
# (size, FP16 VRAM, 8-bit VRAM, 4-bit VRAM, typical device, tokens/sec).
_hw_rows = [
    ('2-3B', '6 GB',   '4 GB',  '2 GB',  'Laptop GPU',              '50-100'),
    ('7B',   '16 GB',  '8 GB',  '4 GB',  'Consumer GPU (RTX 3090)', '20-40'),
    ('13B',  '32 GB',  '16 GB', '8 GB',  'High-end GPU (RTX 4090)', '10-20'),
    ('34B',  '70 GB',  '35 GB', '18 GB', 'A100 40GB',               '5-10'),
    ('70B',  '140 GB', '70 GB', '35 GB', 'Multi-GPU or A100 80GB',  '2-5'),
]
_hw_cols = [
    'Model Size', 'Min_VRAM_FP16', 'Min_VRAM_8bit',
    'Min_VRAM_4bit', 'Typical_Device', 'Tokens_per_sec',
]
# Pivot the rows into the original column-oriented dict.
hardware_reqs = {col: [row[i] for row in _hw_rows]
                 for i, col in enumerate(_hw_cols)}
df_hw = pd.DataFrame(hardware_reqs)
print("Hardware Requirements by Model Size:\n")
df_hw
π‘ Optimization TechniquesΒΆ
Quantization:
FP16: Half precision, ~50% size reduction
8-bit: ~75% size reduction, minimal quality loss
4-bit (GPTQ/GGUF): ~87.5% size reduction, some quality loss
Other Optimizations:
Flash Attention: Faster inference
LoRA: Efficient fine-tuning
vLLM: Optimized serving
llama.cpp: CPU-optimized inference
Trade-offs:
Lower precision = faster, less memory, slight quality drop
Smaller models = faster, less capable
Always test your specific use case!
# Cost comparison (rough estimates for cloud deployment): one tuple per option —
# (deployment, setup cost, monthly low, monthly high, $/1M tokens, control, privacy).
_cost_rows = [
    ('Llama 3 8B (local)',  0, 0,   0,    0,     'Full', 'Max'),
    ('Llama 3 8B (cloud)',  0, 50,  200,  0.10,  'Full', 'High'),
    ('Llama 3 70B (local)', 0, 0,   0,    0,     'Full', 'Max'),
    ('Llama 3 70B (cloud)', 0, 500, 2000, 0.50,  'Full', 'High'),
    ('GPT-3.5 Turbo (API)', 0, 10,  500,  1.50,  'None', 'Low'),
    ('GPT-4 (API)',         0, 30,  5000, 30.00, 'None', 'Low'),
]
_cost_cols = [
    'Deployment', 'Setup_Cost', 'Monthly_Cost_Low', 'Monthly_Cost_High',
    'Cost_per_1M_tokens', 'Control', 'Privacy',
]
# Rebuild the original column-oriented dict from the per-option rows.
cost_data = {col: [row[i] for row in _cost_rows]
             for i, col in enumerate(_cost_cols)}
df_cost = pd.DataFrame(cost_data)
print("\nπ° Deployment Cost Comparison (Monthly USD):\n")
print(df_cost.to_string(index=False))
# Decision-factor summary for choosing a deployment strategy.
# BUG FIX: the notebook export broke each bullet's string literal across two
# physical lines (e.g. `print(" β` / `Privacy is critical")`), which is a
# syntax error in Python. Each print is rejoined onto a single line; the
# original (mojibake) bullet character is preserved byte-for-byte.
print("\nπ― Decision Factors:")
print("\nChoose Open Source Local if:")
print(" β Privacy is critical")
print(" β High volume usage (>10M tokens/month)")
print(" β Need full control and customization")
print(" β Have GPU infrastructure")
print("\nChoose Open Source Cloud if:")
print(" β Moderate volume")
print(" β Want open source benefits without hardware")
print(" β Need flexibility")
print("\nChoose Proprietary API if:")
print(" β Low initial volume")
print(" β Need absolute best quality")
print(" β Want zero ops overhead")
print(" β Privacy not primary concern")
π― Key TakeawaysΒΆ
No single βbestβ model - Choose based on your specific needs
Licensing matters - Apache 2.0/MIT most permissive
Benchmarks are guides - Test on YOUR data
Size isn't everything - Mistral 7B outperforms some 13B models
Quantization enables deployment - 4-bit can run 70B models on consumer GPUs
Context length varies - 2K to 65K tokens
New models released constantly - Stay updated!
π Practice ExercisesΒΆ
Model Selection:
Define requirements for a customer service chatbot
Use the selection framework to choose top 3 models
Justify your choices
Cost Analysis:
Calculate deployment costs for 1M, 10M, 100M tokens/month
Compare open source vs proprietary
Determine break-even point
Benchmark Research:
Find latest benchmark scores for a model family
Compare across versions (e.g., Llama 2 vs Llama 3)
Identify improvements
π ResourcesΒΆ
Model Hubs:
Benchmarks:
Tools: