In this section we will get acquainted with two logging options: Weights & Biases (wandb) and TensorBoard.
!pip install wandb --quiet
!wandb login
import random
import wandb
config = dict(
    project_name="Test wandb",
    learning_rate=0.02,
    architecture="CNN",
    dataset="CIFAR-100",
    epochs=10,
    total_runs=5
)
for run in range(config["total_runs"]):
    # 🐝 1️⃣ Start a new run to track this script
    wandb.init(
        # Set the project where this run will be logged
        project=config["project_name"],
        # We pass a run name (otherwise it’ll be randomly assigned)
        name=f"experiment_{run}",
        # Track hyperparameters and run metadata
        config=config
    )

    # This simple block simulates a training loop logging metrics
    offset = random.random() / 5
    for epoch in range(2, config["epochs"]):
        acc = 1 - 2 ** -epoch - random.random() / epoch - offset
        loss = 2 ** -epoch + random.random() / epoch + offset
        # 🐝 2️⃣ Log metrics from your script to W&B
        wandb.log({
            "acc": acc,
            "loss": loss
        })

    # Mark the run as finished
    wandb.finish()
import wandb
import math
import random
import torch, torchvision
import torch.nn as nn
import torchvision.transforms as T
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def get_dataloader(is_train, batch_size, slice=5):
    "Get a train or validation dataloader over a subset of FashionMNIST"
    full_dataset = torchvision.datasets.FashionMNIST(
        root=".",
        train=is_train,
        transform=T.ToTensor(),
        download=True
    )
    sub_dataset = torch.utils.data.Subset(
        full_dataset,
        indices=range(0, len(full_dataset), slice)
    )
    loader = torch.utils.data.DataLoader(dataset=sub_dataset,
                                         batch_size=batch_size,
                                         shuffle=is_train,
                                         pin_memory=True, num_workers=2)
    return loader
def output_label(label):
    output_mapping = {
        0: "T-shirt/Top",
        1: "Trouser",
        2: "Pullover",
        3: "Dress",
        4: "Coat",
        5: "Sandal",
        6: "Shirt",
        7: "Sneaker",
        8: "Bag",
        9: "Ankle Boot"
    }
    key = label.item() if isinstance(label, torch.Tensor) else label
    return output_mapping[key]
def get_model(dropout):
    "A simple model"
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(28 * 28, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(256, 10)
    ).to(device)
    return model
def validate_model(model, valid_dl, loss_func, log_images=False, batch_idx=0):
    "Compute performance of the model on the validation data, log a wandb.Table"
    model.eval()
    val_loss = 0.
    with torch.inference_mode():
        correct = 0
        for i, (images, labels) in enumerate(valid_dl):
            images, labels = images.to(device), labels.to(device)

            # Forward pass ➡
            outputs = model(images)
            val_loss += loss_func(outputs, labels) * labels.size(0)

            # Compute accuracy and accumulate
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()

            # Log one batch of images to the dashboard, always same batch_idx.
            if i == batch_idx and log_images:
                log_image_table(images, predicted, labels, outputs.softmax(dim=1))
    return val_loss / len(valid_dl.dataset), correct / len(valid_dl.dataset)
def log_image_table(images, predicted, labels, probs):
    "Log a wandb.Table with (img, pred, target, scores)"
    # 🐝 Create a wandb Table to log images, labels and predictions to
    table = wandb.Table(columns=["image", "pred", "target"] +
                                [f"score_{i}" for i in range(10)])
    for img, pred, targ, prob in zip(images.to("cpu"),
                                     predicted.to("cpu"),
                                     labels.to("cpu"),
                                     probs.to("cpu")):
        table.add_data(wandb.Image(img[0].numpy() * 255),
                       output_label(pred),
                       output_label(targ),
                       *prob.numpy())
    wandb.log({"predictions_table": table}, commit=False)
# Launch 5 experiments, trying different dropout rates
for _ in range(5):
    # 🐝 initialise a wandb run
    wandb.init(
        project="pytorch-intro",
        config={
            "epochs": 10,
            "batch_size": 128,
            "lr": 1e-3,
            "dropout": random.uniform(0.01, 0.80)
        })

    # Copy your config
    config = wandb.config

    # Get the data
    train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
    valid_dl = get_dataloader(is_train=False, batch_size=2 * config.batch_size)
    n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

    # A simple MLP model
    model = get_model(config.dropout)

    # Make the loss and optimizer
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    # Training
    example_ct = 0
    step_ct = 0
    for epoch in range(config.epochs):
        model.train()
        for step, (images, labels) in enumerate(train_dl):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            train_loss = loss_func(outputs, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            example_ct += len(images)
            metrics = {"train/train_loss": train_loss,
                       "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch,
                       "train/example_ct": example_ct}

            if step + 1 < n_steps_per_epoch:
                # 🐝 Log train metrics to wandb
                wandb.log(metrics)

            step_ct += 1

        val_loss, accuracy = validate_model(
            model,
            valid_dl,
            loss_func,
            log_images=(epoch == (config.epochs - 1))
        )

        # 🐝 Log train and validation metrics to wandb
        val_metrics = {"val/val_loss": val_loss,
                       "val/val_accuracy": accuracy}
        wandb.log({**metrics, **val_metrics})

        print(f"🤖 Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:.3f}, "
              f"Accuracy: {accuracy:.2f}")

    # If you had a test set, this is how you could log it as a Summary metric
    wandb.summary['test_accuracy'] = 0.8

    # 🐝 Close your wandb run
    wandb.finish()
Here we can plot the generalization error curve in the W&B dashboard and experiment with the legend.
Make two runs with the same dropout rate, compare them, and draw a conclusion.
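As a minimal sketch (not part of the training loop above), the generalization gap between validation and training loss could be logged as its own metric so it shows up as a separate chart in the W&B UI; the helper and the metric name "val/generalization_gap" below are hypothetical.
def log_generalization_gap(train_loss, val_loss):
    # Hypothetical helper: log the gap between validation and training loss
    # as an extra metric (assumes an active wandb run).
    gap = float(val_loss) - float(train_loss)
    wandb.log({"val/generalization_gap": gap})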
import numpy as np

def fix_seed(seed=42):
    # torch.use_deterministic_algorithms(True)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
methods = ["SGD", "Adam"]
N_runs = 3

for method in methods:
    # Launch N_runs experiments per optimizer, with a fixed dropout rate
    for run_idx in range(N_runs):
        fix_seed(run_idx)
        # 🐝 initialise a wandb run
        wandb.init(
            # reinit=True,
            project="pytorch-intro",
            config={
                "epochs": 10,
                "batch_size": 128,
                "dropout": 0.1,
                "method": method
            })

        # Copy your config
        config = wandb.config

        # Get the data
        train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
        valid_dl = get_dataloader(is_train=False, batch_size=2 * config.batch_size)
        n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

        # A simple MLP model
        model = get_model(config.dropout)

        # Make the loss and optimizer
        loss_func = nn.CrossEntropyLoss()
        if method == "Adam":
            config.lr = 1e-3
            optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
        elif method == "SGD":
            config.lr = 5e-2
            optimizer = torch.optim.SGD(model.parameters(), lr=config.lr)

        # Training
        example_ct = 0
        step_ct = 0
        for epoch in range(config.epochs):
            model.train()
            for step, (images, labels) in enumerate(train_dl):
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                train_loss = loss_func(outputs, labels)
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                example_ct += len(images)
                metrics = {"train/train_loss": train_loss,
                           "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch,
                           "train/example_ct": example_ct}

                if step + 1 < n_steps_per_epoch:
                    # 🐝 Log train metrics to wandb
                    wandb.log(metrics)

                step_ct += 1

            val_loss, accuracy = validate_model(
                model,
                valid_dl,
                loss_func,
                log_images=(epoch == (config.epochs - 1))
            )

            # 🐝 Log train and validation metrics to wandb
            val_metrics = {"val/val_loss": val_loss,
                           "val/val_accuracy": accuracy}
            wandb.log({**metrics, **val_metrics})

            print(f"Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:.3f}, "
                  f"Accuracy: {accuracy:.2f}")

        # If you had a test set, this is how you could log it as a Summary metric
        wandb.summary['test_accuracy'] = 0.8

        # 🐝 Close your wandb run
        wandb.finish()
import wandb
import pickle

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
import wandb

wandb.login()

entity = "skoltech_optimization"  # team name
project = 'Test wandb'  # mtl_mcifar10, mtl_mcifar10_adaptive
datafile = 'wandb_data'
logged_metrics = ["acc",
                  "loss"]

api = wandb.Api()
runs = api.runs(f"{entity}/{project}")

# try:
#     global_history_information = load_obj(datafile)
# except TypeError:
#     global_history_information = {}
# else:
#     print(f'Updating current object')
global_history_information = {}

for run in runs:
    # Avoid redundant API requests for runs we have already fetched
    if run.id in global_history_information.keys():
        continue
    global_history_information[run.id] = {}
    global_history_information[run.id]['name'] = run.name
    for metric in logged_metrics:
        global_history_information[run.id][metric] = []
    print(f'🤖 Requesting metrics for {project}. {run.notes}')
    for dict_item in run.scan_history():
        for metric in logged_metrics:
            if metric in dict_item.keys():
                global_history_information[run.id][metric].append(dict_item[metric])

save_obj(global_history_information, datafile)

for run_id in global_history_information:
    print(global_history_information[run_id])
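Now the fetched histories can be plotted locally, for example to compare the loss curves of the runs. A minimal sketch, assuming the global_history_information dictionary built above and that matplotlib is installed:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
for run_id, info in global_history_information.items():
    if info["loss"]:
        # One curve per run; the run name becomes the legend entry.
        plt.plot(info["loss"], label=info["name"])
plt.xlabel("logged step")
plt.ylabel("loss")
plt.title(f"Logged loss for runs in '{project}'")
plt.legend()
plt.show()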
# Load the TensorBoard notebook extension
%load_ext tensorboard
import torch
from torch.utils.tensorboard import SummaryWriter
log_dir = "logs"
writer = SummaryWriter(log_dir)
x = torch.arange(-5, 5, 0.1).view(-1, 1)
y = -5 * x + 0.1 * torch.randn(x.size())
model = torch.nn.Linear(1, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
def train_model(iter):
    for epoch in range(iter):
        y1 = model(x)
        loss = criterion(y1, y)
        writer.add_scalar("Loss/train", loss, epoch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
train_model(10)
writer.flush()
writer.close()
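To compare several runs in TensorBoard, each run is typically written to its own subdirectory of the log directory; TensorBoard then overlays the curves and labels them by directory name. A minimal sketch reusing x, y and criterion from above (the subdirectory names and learning rates are arbitrary):
for lr in (0.1, 0.01):
    # One writer per run, each in its own subdirectory of "logs".
    run_writer = SummaryWriter(f"logs/sgd_lr_{lr}")
    run_model = torch.nn.Linear(1, 1)
    run_optimizer = torch.optim.SGD(run_model.parameters(), lr=lr)
    for epoch in range(10):
        loss = criterion(run_model(x), y)
        run_writer.add_scalar("Loss/train", loss, epoch)
        run_optimizer.zero_grad()
        loss.backward()
        run_optimizer.step()
    run_writer.flush()
    run_writer.close()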
%tensorboard --logdir=logs
Useful links and sources