First, install the required packages:

```bash
pip install mlflow torch torchvision
```
Step 1: Create a new experiment
Create a new MLflow experiment for the tutorial and enable system metrics monitoring. Here we set the monitoring interval to 1 second because the training will be quick, but for longer training runs, you can set it to a larger value.
```python
import mlflow

# The set_experiment API creates a new experiment if it doesn't exist.
mlflow.set_experiment("Deep Learning Experiment")

# IMPORTANT: Enable system metrics monitoring
mlflow.config.enable_system_metrics_logging()
mlflow.config.set_system_metrics_sampling_interval(1)
```
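If you prefer not to change the global configuration, recent MLflow versions also let you enable system metrics for a single run. A minimal sketch (the `log_system_metrics` flag is the assumption here; check that your MLflow version supports it):

```python
import mlflow

# Enable system metrics only for this run, leaving the global config untouched.
with mlflow.start_run(log_system_metrics=True):
    ...  # training code goes here
```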
Step 2: Prepare the dataset
In this example, we will use the FashionMNIST dataset, which is a collection of 28x28 grayscale images of 10 different types of clothing.
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and prepare data
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
train_dataset = datasets.FashionMNIST(
    "data", train=True, download=True, transform=transform
)
test_dataset = datasets.FashionMNIST("data", train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000)
```
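As an optional sanity check (not part of the original tutorial), you can pull one batch and confirm the tensor shapes match the 28x28 grayscale format described above:

```python
# One training batch: images are (64, 1, 28, 28), labels are (64,)
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)
```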
Step 3: Define the model and optimizer
Define a simple MLP (multilayer perceptron) with two hidden layers of 512 units each.
```python
import torch.nn as nn


class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model = NeuralNetwork().to(device)
```
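As an optional aside, you can verify the size of this MLP by counting its trainable parameters:

```python
# 784*512 + 512 (first layer) + 512*512 + 512 (second) + 512*10 + 10 (output)
# = 669,706 trainable parameters
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params:,}")
```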
Then, define the training parameters and optimizer.
```python
# Training parameters
params = {
    "epochs": 5,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "optimizer": "SGD",
    "model_type": "MLP",
    "hidden_units": [512, 512],
}

# Define optimizer and loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=params["learning_rate"])
```
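Because the optimizer name is already recorded in `params`, one way to keep the logged value and the actual optimizer in sync is to look the class up by that string. This is a sketch of one possible pattern, not something the tutorial requires:

```python
# Resolve the optimizer class from the name stored in params, so the logged
# "optimizer" parameter always matches what is actually used for training.
optimizer_cls = getattr(optim, params["optimizer"])  # e.g. optim.SGD
optimizer = optimizer_cls(model.parameters(), lr=params["learning_rate"])
```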
Step 4: Train the model
Now we are ready to train the model. Inside the training loop, we log metrics and checkpoints to MLflow. The key points in this code are:

- Initiate an MLflow run context with `mlflow.start_run` to start a new run that the model and metadata will be logged to.
- Log training parameters using `mlflow.log_params`.
- Log various metrics using `mlflow.log_metrics`.
- Save a checkpoint at the end of each epoch using `mlflow.pytorch.log_model`.
```python
with mlflow.start_run() as run:
    # Log training parameters
    mlflow.log_params(params)

    for epoch in range(params["epochs"]):
        model.train()
        train_loss, correct, total = 0, 0, 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            # Forward pass
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Calculate metrics
            train_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Log batch metrics (every 100 batches)
            if batch_idx % 100 == 0:
                batch_loss = train_loss / (batch_idx + 1)
                batch_acc = 100.0 * correct / total
                mlflow.log_metrics(
                    {"batch_loss": batch_loss, "batch_accuracy": batch_acc},
                    step=epoch * len(train_loader) + batch_idx,
                )

        # Calculate epoch metrics
        epoch_loss = train_loss / len(train_loader)
        epoch_acc = 100.0 * correct / total

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = loss_fn(output, target)

                val_loss += loss.item()
                _, predicted = output.max(1)
                val_total += target.size(0)
                val_correct += predicted.eq(target).sum().item()

        # Calculate and log epoch validation metrics
        val_loss = val_loss / len(test_loader)
        val_acc = 100.0 * val_correct / val_total

        # Log epoch metrics
        mlflow.log_metrics(
            {
                "train_loss": epoch_loss,
                "train_accuracy": epoch_acc,
                "val_loss": val_loss,
                "val_accuracy": val_acc,
            },
            step=epoch,
        )

        # Log a checkpoint at the end of each epoch
        mlflow.pytorch.log_model(model, name=f"checkpoint_{epoch}")

        print(
            f"Epoch {epoch+1}/{params['epochs']}, "
            f"Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%"
        )

    # Log the final trained model
    model_info = mlflow.pytorch.log_model(model, name="final_model")
```
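Once the run finishes, the logged model can be loaded back from the tracking server for inference using the `model_info` returned above. A minimal sketch:

```python
# Reload the final model from MLflow and run a quick prediction.
loaded_model = mlflow.pytorch.load_model(model_info.model_uri).to(device)
loaded_model.eval()

with torch.no_grad():
    sample, _ = test_dataset[0]
    logits = loaded_model(sample.unsqueeze(0).to(device))
    print(f"Predicted class: {logits.argmax(1).item()}")
```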
Now view the results in the MLflow UI. Start it with the following command and open http://localhost:5000 in your browser:

```bash
mlflow ui --port 5000
```