import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG so every run of this notebook is reproducible.
np.random.seed(42)
# NOTE(review): the original print's string literal was split across two
# lines (a syntax error from notebook export); rejoined into one call.
print("✅ Imports successful!")
What is a Neuron?ΒΆ
A neuron is a simple function:
output = activation(weights Β· inputs + bias)
Components:
Inputs: Data features (e.g., pixel values, word embeddings)
Weights: Learned parameters that determine importance
Bias: Learned offset
Activation: Non-linear function (ReLU, Sigmoid, etc.)
Let's implement a single neuron:
def sigmoid(x):
    """Sigmoid activation: maps any real value into the open interval (0, 1).

    Args:
        x: scalar or NumPy array.

    Returns:
        1 / (1 + exp(-x)), elementwise.
    """
    # Clip to [-500, 500] so np.exp cannot overflow for large-magnitude
    # inputs; matches the numerically-safe definition used later in this
    # notebook and is identical within float precision.
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def neuron(inputs, weights, bias):
    """
    Compute the output of a single artificial neuron.

    Args:
        inputs: array of input features
        weights: array of weights (same shape as inputs)
        bias: single number

    Returns:
        output: sigmoid-activated weighted sum
    """
    # Affine step: weighted sum of the inputs plus the bias offset.
    pre_activation = np.dot(weights, inputs) + bias
    # Squash the result into (0, 1) with the sigmoid non-linearity.
    return sigmoid(pre_activation)
# Example: A neuron that classifies (hot, sunny) vs (cold, rainy)
inputs = np.array([80, 8]) # [temperature, sunshine_hours]
weights = np.array([0.02, 0.1]) # learned values
bias = -1.5
# Weighted sum is 80*0.02 + 8*0.1 - 1.5 = 0.9, so output = sigmoid(0.9) ~ 0.71.
output = neuron(inputs, weights, bias)
print(f"Input: Temperature={inputs[0]}Β°F, Sunshine={inputs[1]} hours")
print(f"Output: {output:.4f}")
# Threshold the probability at 0.5 to turn it into a hard class prediction.
print(f"Prediction: {'Good weather!' if output > 0.5 else 'Bad weather'}")
2. Activation FunctionsΒΆ
Activation functions add non-linearity to neural networks. Without them, multiple layers would collapse into a single linear transformation.
Common Activation FunctionsΒΆ
def relu(x):
    """ReLU activation: pass positive values through, zero out negatives.

    The workhorse activation for hidden layers: cheap to compute and it
    keeps gradients alive for positive inputs.
    """
    # np.maximum broadcasts, so scalars and arrays both work.
    return np.maximum(x, 0)
def sigmoid(x):
    """Sigmoid activation: squashes any real input into (0, 1).

    Typically used on the output layer for binary classification, where
    the result is interpreted as a probability.
    """
    # Clamp the input first so np.exp never overflows for |x| > 500.
    safe_x = np.clip(x, -500, 500)
    return 1.0 / (1.0 + np.exp(-safe_x))
def tanh(x):
    """Tanh activation: squashes input into (-1, 1), centered at zero.

    Thin wrapper that delegates to NumPy's C implementation.
    """
    return np.tanh(x)
# Visualize the three activation functions side by side.
x = np.linspace(-5, 5, 100)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One tuple per panel: (function, title, line style, horizontal
# reference-line height, y-axis label).
panels = [
    (relu, 'ReLU: max(0, x)', 'b-', 0, 'ReLU(x)'),
    (sigmoid, 'Sigmoid: 1/(1+e^-x)', 'r-', 0.5, 'Sigmoid(x)'),
    (tanh, 'Tanh: (e^x - e^-x)/(e^x + e^-x)', 'g-', 0, 'Tanh(x)'),
]
for ax, (fn, title, style, y_ref, y_label) in zip(axes, panels):
    ax.plot(x, fn(x), style, linewidth=2)
    ax.set_title(title, fontsize=14)
    ax.grid(True, alpha=0.3)
    # Dashed reference lines mark the function's midpoint and x = 0.
    ax.axhline(y=y_ref, color='k', linestyle='--', alpha=0.3)
    ax.axvline(x=0, color='k', linestyle='--', alpha=0.3)
    ax.set_xlabel('x')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
# Summarize the practical trade-offs of each activation function.
# NOTE(review): "π" and "β)" in the strings below look like mojibake of an
# emoji and "∞)" — confirm against the original notebook's encoding.
print("\nπ Key Properties:")
print("\nReLU:")
print(" - Most popular for hidden layers")
print(" - Fast to compute")
print(" - Helps with vanishing gradient problem")
print(" - Output: [0, β)")
print("\nSigmoid:")
print(" - Good for binary classification (output layer)")
print(" - Outputs probability (0 to 1)")
print(" - Can suffer from vanishing gradients")
print("\nTanh:")
print(" - Similar to sigmoid but centered at 0")
print(" - Output: (-1, 1)")
print(" - Better than sigmoid for hidden layers")
3. Building a Neural Network LayerΒΆ
A layer is a collection of neurons. Each neuron receives the same inputs but has different weights.
Input Layer Hidden Layer Output Layer
x1 βββββββββ h1 βββββββββ y1
x2 βββββββββ h2 βββββββββ y2
x3 βββββββββ h3 βββββββββ
class Layer:
    """A fully connected (dense) neural network layer.

    Holds a weight matrix of shape (n_neurons, n_inputs) and a bias
    vector of shape (n_neurons, 1), and applies activation(W @ x + b)
    on each forward pass.
    """

    def __init__(self, n_inputs, n_neurons, activation='relu'):
        """
        Args:
            n_inputs: number of input features
            n_neurons: number of neurons in this layer
            activation: 'relu', 'sigmoid', or 'tanh'

        Raises:
            ValueError: if `activation` is not one of the supported names.
        """
        # Initialize weights with small random values: non-zero so neurons
        # break symmetry, small so activations start near the linear regime.
        # Shape: (n_neurons, n_inputs)
        self.weights = np.random.randn(n_neurons, n_inputs) * 0.01
        # Initialize biases to zeros. Shape: (n_neurons, 1)
        self.biases = np.zeros((n_neurons, 1))
        # Resolve the activation function by name. Fail fast on a typo
        # instead of leaving self.activation unset and raising a confusing
        # AttributeError later in forward().
        activations = {'relu': relu, 'sigmoid': sigmoid, 'tanh': tanh}
        if activation not in activations:
            raise ValueError(
                f"Unknown activation '{activation}'; "
                f"expected one of {sorted(activations)}"
            )
        self.activation_name = activation
        self.activation = activations[activation]

    def forward(self, inputs):
        """
        Forward pass through the layer.

        Args:
            inputs: shape (n_inputs, n_samples)

        Returns:
            output: shape (n_neurons, n_samples)
        """
        # Linear transformation: W @ x + b (biases broadcast over samples).
        z = np.dot(self.weights, inputs) + self.biases
        # Apply the non-linearity elementwise.
        output = self.activation(z)
        # Cache intermediates for backpropagation (implemented in the
        # next notebook; train_simple below reads self.inputs).
        self.inputs = inputs
        self.z = z
        self.output = output
        return output
# Example: Create a layer with 3 input features and 4 ReLU neurons.
layer = Layer(n_inputs=3, n_neurons=4, activation='relu')
# Forward pass with one sample (column vector: one column = one sample).
x = np.array([[1.0], [2.0], [3.0]]) # 3 features
output = layer.forward(x)
print("Layer configuration:")
print(f" Inputs: {layer.weights.shape[1]}")
print(f" Neurons: {layer.weights.shape[0]}")
print(f" Activation: {layer.activation_name}")
print(f"\nWeights shape: {layer.weights.shape}")
print(f"Biases shape: {layer.biases.shape}")
print(f"\nInput shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"\nOutput values:\n{output}")
4. Building a Complete Neural NetworkΒΆ
Stacking Layers into a Multi-Layer PerceptronΒΆ
A single layer can only learn a limited family of functions. By stacking multiple layers β each applying a linear transformation followed by a non-linear activation β we create a multi-layer perceptron (MLP) that can approximate arbitrarily complex functions (this is the Universal Approximation Theorem).
Data flows through the network in a single direction during the forward pass: each layer receives the previous layerβs output, transforms it, and passes the result onward. The final layer typically uses a sigmoid (for binary classification) or softmax (for multi-class) activation to produce a probability. The intermediate βhiddenβ layers use ReLU or similar activations to learn increasingly abstract representations of the input features.
class NeuralNetwork:
    """A simple feedforward neural network (multi-layer perceptron).

    Built from a stack of fully connected Layer objects; data flows
    through them in order during forward().
    """

    def __init__(self, layer_sizes, activations):
        """
        Args:
            layer_sizes: list of layer sizes [input, hidden1, ..., output]
            activations: list of activation names, one per weight layer,
                so len(activations) == len(layer_sizes) - 1

        Raises:
            ValueError: if the number of activations does not match the
                number of weight layers implied by layer_sizes.
        """
        # Fail fast on a mismatched spec instead of an IndexError mid-loop.
        if len(activations) != len(layer_sizes) - 1:
            raise ValueError(
                f"Expected {len(layer_sizes) - 1} activations for "
                f"{len(layer_sizes)} layer sizes, got {len(activations)}"
            )
        self.layers = []
        for i in range(len(layer_sizes) - 1):
            layer = Layer(
                n_inputs=layer_sizes[i],
                n_neurons=layer_sizes[i + 1],
                activation=activations[i]
            )
            self.layers.append(layer)

    def forward(self, X):
        """
        Forward pass through all layers.

        Args:
            X: input data, shape (n_features, n_samples)

        Returns:
            output: final layer's output, shape (n_outputs, n_samples)
        """
        # Each layer consumes the previous layer's output.
        output = X
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def __repr__(self):
        """Return a human-readable summary of the architecture."""
        lines = ["Neural Network Architecture:"]
        for i, layer in enumerate(self.layers):
            lines.append(f" Layer {i+1}: {layer.weights.shape[1]} β {layer.weights.shape[0]} ({layer.activation_name})")
        return "\n".join(lines)
# Create a network: 2 inputs -> 4 hidden neurons -> 3 hidden neurons -> 1 output
nn = NeuralNetwork(
    layer_sizes=[2, 4, 3, 1],
    activations=['relu', 'relu', 'sigmoid']
)
print(nn)
# Test the forward pass on a random batch of 5 samples.
X_test = np.random.randn(2, 5) # 2 features, 5 samples
output = nn.forward(X_test)
print(f"\nInput shape: {X_test.shape}")
print(f"Output shape: {output.shape}")
print(f"Output values (probabilities):\n{output.T}")
5. Training a Neural NetworkΒΆ
Training involves:
Forward pass: Compute predictions
Compute loss: How wrong are we?
Backward pass: Compute gradients (next notebook!)
Update weights: Move towards better predictions
For now, weβll use a simple update rule. In the next notebook, weβll implement proper backpropagation.
# Generate binary classification data: two interleaving half-circles.
np.random.seed(42)
X, y = make_moons(n_samples=300, noise=0.2, random_state=42)
X = X.T # Shape: (2, 300)
y = y.reshape(1, -1) # Shape: (1, 300)
# Split into train/test. train_test_split expects samples in rows, so we
# transpose to (n_samples, n_features), split, then transpose back to the
# (n_features, n_samples) layout the network uses.
X_train, X_test, y_train, y_test = train_test_split(
    X.T, y.T, test_size=0.2, random_state=42
)
X_train, X_test = X_train.T, X_test.T
y_train, y_test = y_train.T, y_test.T
print(f"Training data: {X_train.shape}")
print(f"Training labels: {y_train.shape}")
# Visualize the two classes; y_train[0]==k builds a boolean column mask.
plt.figure(figsize=(8, 6))
plt.scatter(X_train[0, y_train[0]==0], X_train[1, y_train[0]==0],
            c='blue', label='Class 0', alpha=0.6, edgecolors='k')
plt.scatter(X_train[0, y_train[0]==1], X_train[1, y_train[0]==1],
            c='red', label='Class 1', alpha=0.6, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Binary Classification Data (Moons)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Loss FunctionΒΆ
For binary classification, we use Binary Cross-Entropy:
Where:
\(y_i\) is the true label (0 or 1)
\(\hat{y}_i\) is the predicted probability
\(m\) is the number of samples
def binary_cross_entropy(y_true, y_pred):
    """
    Compute the binary cross-entropy loss.

    Args:
        y_true: true labels (0 or 1), shape (1, n_samples)
        y_pred: predicted probabilities, shape (1, n_samples)

    Returns:
        loss: scalar, mean cross-entropy over all samples
    """
    # Clip predictions away from exactly 0 and 1 to prevent log(0) = -inf.
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
    # Mean of -[y*log(p) + (1-y)*log(1-p)]. np.mean already divides by the
    # sample count, so the original's unused `m = y_true.shape[1]` local
    # has been removed.
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
def accuracy(y_true, y_pred):
    """Fraction of samples whose thresholded prediction matches the label."""
    # Probabilities strictly above 0.5 count as class 1, otherwise class 0.
    hard_labels = (y_pred > 0.5).astype(float)
    return np.mean(hard_labels == y_true)
# Sanity-check the loss and accuracy helpers on a tiny hand-made example:
# all four predictions are on the correct side of 0.5, so accuracy is 1.0.
y_true_test = np.array([[0, 1, 1, 0]])
y_pred_test = np.array([[0.1, 0.9, 0.8, 0.2]])
loss = binary_cross_entropy(y_true_test, y_pred_test)
acc = accuracy(y_true_test, y_pred_test)
print(f"Example loss: {loss:.4f}")
print(f"Example accuracy: {acc:.4f}")
Simple Training LoopΒΆ
Gradient descent works by nudging each weight in the direction that reduces the loss. At each step we compute the gradient \(\frac{\partial L}{\partial w}\) for every weight, then update: \(w \leftarrow w - \alpha \frac{\partial L}{\partial w}\), where \(\alpha\) is the learning rate. The loop below uses a simplified gradient calculation to demonstrate the concept. In the next notebook we will implement backpropagation, the efficient algorithm that computes exact gradients for every parameter in a single backward pass through the networkβs computational graph.
def train_simple(nn, X, y, learning_rate=0.1, epochs=1000):
    """
    Simple training loop with a crude, shared gradient approximation.

    NOTE(review): this is intentionally NOT backpropagation. The same
    output-layer error is applied to every layer (it broadcasts against
    each layer's cached inputs), which only demonstrates the mechanics of
    iterative weight updates. Exact per-layer gradients arrive in the
    next notebook.

    Args:
        nn: NeuralNetwork whose layers cache `inputs` during forward()
        X: training inputs, shape (n_features, n_samples)
        y: training labels, shape (1, n_samples)
        learning_rate: step size for the updates
        epochs: number of full passes over the data

    Returns:
        (losses, accuracies): per-epoch training metrics
    """
    losses = []
    accuracies = []
    for epoch in range(epochs):
        # Forward pass
        y_pred = nn.forward(X)
        # Compute loss and accuracy on the full batch
        loss = binary_cross_entropy(y, y_pred)
        acc = accuracy(y, y_pred)
        losses.append(loss)
        accuracies.append(acc)
        # Simple weight update (not optimal, but demonstrates the concept)
        # In the next notebook, we'll implement proper backpropagation.
        for layer in nn.layers:
            # Gradient of the loss w.r.t. the sigmoid output — exact only
            # for the final layer, but reused for every layer here.
            error = y_pred - y
            # Shape (1, n_inputs_of_layer); NumPy broadcasts this single
            # row across all of the layer's weight rows below.
            grad_weights = np.dot(error, layer.inputs.T) / X.shape[1]
            grad_biases = np.mean(error, axis=1, keepdims=True)
            layer.weights -= learning_rate * grad_weights
            layer.biases -= learning_rate * grad_biases
        if (epoch + 1) % 200 == 0:
            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}, Accuracy: {acc:.4f}")
    return losses, accuracies
# Create and train the network used for the rest of the notebook.
print("Creating neural network...\n")
nn = NeuralNetwork(
    layer_sizes=[2, 8, 8, 1],  # 2 inputs -> 8 -> 8 -> 1 output
    activations=['relu', 'relu', 'sigmoid']
)
print(nn)
print("\nTraining...\n")
losses, accuracies = train_simple(
    nn, X_train, y_train,
    learning_rate=0.5,
    epochs=1000
)
# NOTE(review): the original string literal was split across two lines
# (a syntax error from notebook export); rejoined into a single call.
print("\n✅ Training complete!")
# Plot training curves side by side: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Loss curve
ax1.plot(losses, 'b-', linewidth=2)
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss (Binary Cross-Entropy)')
ax1.set_title('Training Loss')
ax1.grid(True, alpha=0.3)
# Accuracy curve
ax2.plot(accuracies, 'g-', linewidth=2)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.set_title('Training Accuracy')
ax2.grid(True, alpha=0.3)
# Accuracy is a proportion, so pin the axis to [0, 1].
ax2.set_ylim([0, 1])
plt.tight_layout()
plt.show()
# Evaluate on the held-out test split to check generalization.
y_pred_test = nn.forward(X_test)
test_loss = binary_cross_entropy(y_test, y_pred_test)
test_acc = accuracy(y_test, y_pred_test)
print(f"\nπ Test Set Performance:")
print(f" Loss: {test_loss:.4f}")
print(f" Accuracy: {test_acc*100:.2f}%")
6. Visualizing Decision BoundariesΒΆ
Seeing What the Network LearnedΒΆ
One of the most intuitive ways to evaluate a classifier is to visualize its decision boundary β the surface in feature space where the predicted class changes. For 2D input data we can evaluate the network on a dense grid of points and color-code the predicted probability. A network that has learned the underlying pattern will show a smooth, well-separated boundary that closely follows the true data distribution. Jagged or overly complex boundaries may indicate overfitting, while boundaries that are too simple suggest underfitting. Comparing decision boundaries on training versus test data reveals how well the model generalizes to unseen examples.
def plot_decision_boundary(nn, X, y, title="Decision Boundary"):
    """
    Visualize the probability surface a trained network assigns to 2D data.

    Evaluates the network on a dense 200x200 grid covering the data (with
    a 0.5-unit margin), draws the predicted probability as filled contours,
    and overlays the labeled samples on top.
    """
    # Grid bounds: the data's extent padded by half a unit on every side.
    x_lo, x_hi = X[0, :].min() - 0.5, X[0, :].max() + 0.5
    y_lo, y_hi = X[1, :].min() - 0.5, X[1, :].max() + 0.5
    xx, yy = np.meshgrid(
        np.linspace(x_lo, x_hi, 200),
        np.linspace(y_lo, y_hi, 200)
    )
    # Flatten the grid into (2, n_points) columns and run one forward pass.
    grid_points = np.c_[xx.ravel(), yy.ravel()].T
    probs = nn.forward(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    # Filled contours show the predicted probability of class 1.
    plt.contourf(xx, yy, probs, levels=20, cmap='RdYlBu', alpha=0.6)
    plt.colorbar(label='Predicted Probability')
    # Overlay the actual samples, colored by their true class.
    mask0 = y[0] == 0
    mask1 = y[0] == 1
    plt.scatter(X[0, mask0], X[1, mask0],
                c='blue', label='Class 0', alpha=0.8, edgecolors='k', s=50)
    plt.scatter(X[0, mask1], X[1, mask1],
                c='red', label='Class 1', alpha=0.8, edgecolors='k', s=50)
    plt.xlabel('Feature 1', fontsize=12)
    plt.ylabel('Feature 2', fontsize=12)
    plt.title(title, fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.show()
# Decision boundary over the data the network was trained on.
plot_decision_boundary(nn, X_train, y_train, "Training Data - Decision Boundary")
# Decision boundary over held-out data, to eyeball generalization.
plot_decision_boundary(nn, X_test, y_test, "Test Data - Decision Boundary")
7. Experimenting with ArchitectureΒΆ
How Network Shape Affects LearningΒΆ
The choice of architecture β number of layers, number of neurons per layer, and activation functions β has a profound impact on what a network can learn and how efficiently it trains. A shallow network (few layers) may lack the representational power to capture complex patterns, while a deep network (many layers) has more capacity but is harder to train and more prone to overfitting. A wide network (many neurons in a single layer) can memorize patterns but may not generalize well. In practice, architecture design is guided by the complexity of the task, the amount of training data, and computational budget. The comparison below trains several architectures on the same dataset so you can observe these trade-offs firsthand.
def compare_architectures(X_train, y_train, X_test, y_test, architectures):
    """
    Train several network architectures on the same data and report each
    one's test accuracy.

    Args:
        architectures: list of (name, layer_sizes, activations) triples.

    Returns:
        List of dicts with keys 'name', 'accuracy', and 'final_loss'.
    """
    results = []
    for name, layer_sizes, activations in architectures:
        print(f"\nTraining: {name}")
        print(f" Architecture: {' β '.join(map(str, layer_sizes))}")
        # Fresh network per architecture so runs don't share weights.
        candidate = NeuralNetwork(layer_sizes, activations)
        losses, _ = train_simple(candidate, X_train, y_train,
                                 learning_rate=0.5, epochs=500)
        # Score on the held-out split.
        test_acc = accuracy(y_test, candidate.forward(X_test))
        results.append({
            'name': name,
            'accuracy': test_acc,
            'final_loss': losses[-1],
        })
        print(f" Test Accuracy: {test_acc*100:.2f}%")
    return results
# Define architectures to compare (name, layer sizes, activations).
# NOTE(review): the "β" inside the display names looks like mojibake of an
# arrow character — confirm against the original notebook's encoding.
architectures = [
    ("Shallow (2β4β1)", [2, 4, 1], ['relu', 'sigmoid']),
    ("Medium (2β8β8β1)", [2, 8, 8, 1], ['relu', 'relu', 'sigmoid']),
    ("Deep (2β16β16β16β1)", [2, 16, 16, 16, 1], ['relu', 'relu', 'relu', 'sigmoid']),
    ("Wide (2β32β1)", [2, 32, 1], ['relu', 'sigmoid']),
]
print("="*60)
print("Comparing Neural Network Architectures")
print("="*60)
results = compare_architectures(X_train, y_train, X_test, y_test, architectures)
# Plot the comparison as a bar chart of test accuracy per architecture.
plt.figure(figsize=(10, 6))
names = [r['name'] for r in results]
accuracies = [r['accuracy'] * 100 for r in results]
bars = plt.bar(names, accuracies, color=['steelblue', 'darkgreen', 'crimson', 'orange'])
plt.ylabel('Test Accuracy (%)', fontsize=12)
plt.title('Architecture Comparison', fontsize=14, fontweight='bold')
plt.ylim([0, 100])
plt.xticks(rotation=15, ha='right')
plt.grid(axis='y', alpha=0.3)
# Add a percentage label on top of each bar.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
plt.tight_layout()
plt.show()
print("\n" + "="*60)
print("Key Insights:")
print("="*60)
print("1. Deeper isn't always better - depends on the problem")
print("2. More parameters = more capacity, but also risk of overfitting")
print("3. Architecture choice is problem-dependent")
print("4. Start simple, add complexity as needed")
SummaryΒΆ
β What You LearnedΒΆ
Neurons: Basic building blocks that compute weighted sums + activation
Activation Functions: Add non-linearity (ReLU, Sigmoid, Tanh)
Layers: Collections of neurons processing data
Neural Networks: Stacked layers that learn representations
Forward Propagation: How data flows through the network
Loss Functions: Measuring prediction error
Training: Iteratively improving weights
π Key ConceptsΒΆ
# The neural network formula
z = W @ x + b # Linear transformation
a = activation(z) # Non-linear activation
loss = -y*log(a) - (1-y)*log(1-a) # Binary cross-entropy
π― Whatβs Next?ΒΆ
Next notebook: 02_backpropagation_explained.ipynb
Youβll learn:
How neural networks actually learn (backpropagation)
Computing gradients with the chain rule
Implementing automatic differentiation
Optimization algorithms (SGD, Adam)
π Additional ResourcesΒΆ
Great job! Youβve built a neural network from scratch using only NumPy. In the next notebook, weβll make it learn properly using backpropagation! π