Periodically Save Trained Neural Network Models in PyTorch

Sybernix · 5 min read · Nov 25, 2021
[Figure: Periodically saving models. Source: https://towardsdatascience.com/ml-design-pattern-2-checkpoints-e6ca25a4c5fe]

In my previous blogs, we saw how to train models in PyTorch and how to plot the losses and error rates [1] [2].

We need to save our trained model so that we can use it for predictions whenever needed. We could save only the model from the final epoch, but due to overfitting, the best model may well have been produced at an earlier epoch. It is therefore better to save the model periodically, so that we can later look at the loss and error graphs and pick the checkpoint that performs best for our predictions.

Let’s first define a function to save the models.

def save_network(network, epoch_label):
    save_filename = 'net_%s.pth' % epoch_label
    save_path = os.path.join('./savedModels', save_filename)
    torch.save(network.state_dict(), save_path)

This function takes our model and the epoch number as inputs and saves the state dictionary of the model. Instead of saving the state dictionary, we could save the entire model with torch.save(model, PATH), but this can introduce unexpected errors when we try to use the model on a different machine than the one we trained on [3].
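
For completeness, here is how a checkpoint saved this way can be loaded back later for predictions — a minimal sketch, assuming the same ft_net architecture and a checkpoint from epoch 9:

model = ft_net(len(class_names))  # must be the same architecture as at training time
model.load_state_dict(torch.load('./savedModels/net_9.pth'))
model.eval()  # switch to evaluation mode before predicting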

Note that torch.save(network.state_dict(), save_path) saves the model weights for the device the model is currently on. If you are training on a GPU but want to save the model for use on a CPU, you can write torch.save(network.cpu().state_dict(), save_path). We then need to move the model back to the device we are training on. The save_network function looks as follows in this case.

def save_network(network, epoch_label):
    save_filename = 'net_%s.pth' % epoch_label
    save_path = os.path.join('./savedModels', save_filename)
    torch.save(network.cpu().state_dict(), save_path)
    if torch.cuda.is_available():
        network.cuda()
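
An alternative that avoids shuttling the model between devices during training is to save the weights on whatever device they are on and remap them at load time, using the map_location argument of torch.load:

# load GPU-saved weights on a CPU-only machine
state_dict = torch.load('./savedModels/net_9.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)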

Now, at the end of the validation phase of each epoch, we can call this function to persist the model. However, doing this every epoch can consume a lot of disk space, since a saved model usually takes up hundreds of MBs. So we will save the model every 10 epochs as follows.

if phase == 'val':
    last_model_wts = model.state_dict()
    if epoch % 10 == 9:
        save_network(model, epoch)
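
If disk space is tight, a common alternative (not used in this blog's code) is to overwrite a single "best" checkpoint whenever the validation loss improves. A minimal sketch of that pattern, reusing our save_network function and the epoch_loss computed in the training loop:

# before the training loop
best_val_loss = float('inf')

# inside the epoch loop, after epoch_loss is computed
if phase == 'val' and epoch_loss < best_val_loss:
    best_val_loss = epoch_loss
    save_network(model, 'best')  # overwrites ./savedModels/net_best.pth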

Note that this code is a continuation of my previous blogs [1] and [2]. You can read them to understand how to develop a deep learning model and train it.

Your entire code should look something like the following.

#!/usr/bin/python
# -*- encoding: utf-8 -*-

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from simple_model import ft_net
import matplotlib.pyplot as plt

h, w = 256, 128
data_dir = '/home/niruhan/Personal/paper/Market-1501-v15.09.15/pytorch'
batchsize = 2
num_epochs = 10
use_gpu = torch.cuda.is_available()

transform_train_list = [
    transforms.Resize((h, w), interpolation=3),  # 3 corresponds to Image.BICUBIC
    transforms.Pad(10),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]

transform_val_list = [
    transforms.Resize(size=(h, w), interpolation=3),  # Image.BICUBIC
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]

data_transforms = {
    'train': transforms.Compose(transform_train_list),
    'val': transforms.Compose(transform_val_list),
}

image_datasets = {}
image_datasets['train'] = datasets.ImageFolder(os.path.join(data_dir, 'train'),
                                               data_transforms['train'])
image_datasets['val'] = datasets.ImageFolder(os.path.join(data_dir, 'val'),
                                             data_transforms['val'])

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,
                                              shuffle=True, num_workers=8)
               for x in ['train', 'val']}

class_names = image_datasets['train'].classes
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

model = ft_net(len(class_names))
if use_gpu:
    model = model.cuda()  # the inputs are moved to the GPU inside the loop below
criterion = nn.CrossEntropyLoss()

# use a smaller learning rate for the pretrained backbone than for the new classifier
lr = 0.05
optim_name = optim.SGD
ignored_params = list(map(id, model.classifier.parameters()))
base_params = filter(lambda p: id(p) not in ignored_params, model.parameters())
classifier_params = model.classifier.parameters()
optimizer = optim_name([
    {'params': base_params, 'lr': 0.1 * lr},
    {'params': classifier_params, 'lr': lr}
], weight_decay=5e-4, momentum=0.9, nesterov=True)

y_loss = {'train': [], 'val': []}  # loss history
y_err = {'train': [], 'val': []}   # top-1 error history

x_epoch = []
fig = plt.figure()
ax0 = fig.add_subplot(121, title="loss")
ax1 = fig.add_subplot(122, title="top1err")


def draw_curve(current_epoch):
    x_epoch.append(current_epoch)
    ax0.plot(x_epoch, y_loss['train'], 'bo-', label='train')
    ax0.plot(x_epoch, y_loss['val'], 'ro-', label='val')
    ax1.plot(x_epoch, y_err['train'], 'bo-', label='train')
    ax1.plot(x_epoch, y_err['val'], 'ro-', label='val')
    if current_epoch == 0:
        ax0.legend()
        ax1.legend()
    fig.savefig(os.path.join('./lossGraphs', 'train.jpg'))


def save_network(network, epoch_label):
    save_filename = 'net_%s.pth' % epoch_label
    save_path = os.path.join('./savedModels', save_filename)
    # save CPU tensors so the checkpoint loads on machines without a GPU
    torch.save(network.cpu().state_dict(), save_path)
    if torch.cuda.is_available():
        network.cuda()  # move the model back to the GPU to continue training


for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)
    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            model.train(True)   # Set model to training mode
        else:
            model.train(False)  # Set model to evaluate mode

        running_loss = 0.0
        running_corrects = 0.0

        count = 0
        # Iterate over data.
        for data in dataloaders[phase]:
            # process only the first few batches to keep the demo run short;
            # remove this break for a real training run
            if count > 10:
                break

            count = count + 1
            # get a batch of inputs
            inputs, labels = data
            now_batch_size, c, h, w = inputs.shape
            if now_batch_size < batchsize:  # skip the last batch
                continue
            # print(inputs.shape)
            # wrap them in Variable; if the GPU is used, move the data to cuda
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            # zero the parameter gradients
            optimizer.zero_grad()

            # -------- forward --------
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)

            del inputs

            # -------- backward + optimize --------
            # only if in training phase
            if phase == 'train':
                loss.backward()
                optimizer.step()

            # statistics
            running_loss += loss.item() * now_batch_size
            del loss
            running_corrects += float(torch.sum(preds == labels.data))

        epoch_loss = running_loss / dataset_sizes[phase]
        epoch_acc = running_corrects / dataset_sizes[phase]

        print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

        y_loss[phase].append(epoch_loss)
        y_err[phase].append(1.0 - epoch_acc)

        # keep the latest weights and checkpoint every 10th epoch
        if phase == 'val':
            last_model_wts = model.state_dict()
            if epoch % 10 == 9:
                save_network(model, epoch)
            draw_curve(epoch)

Don’t forget to create a directory named “savedModels” in your project path! (The draw_curve function from the previous blog likewise expects a “lossGraphs” directory.)
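
Alternatively, a couple of lines at the top of the script can create the directories for you (os.makedirs with exist_ok=True is a no-op when the directory already exists):

import os

# create the output directories if they do not already exist
os.makedirs('./savedModels', exist_ok=True)
os.makedirs('./lossGraphs', exist_ok=True)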

Now run the Python file. We have set the batch size to 2 and the number of epochs to 10. I have also added the following snippet so that only the first few batches are processed in each phase.

count = 0
# Iterate over data.
for data in dataloaders[phase]:
    if count > 10:
        break

We can remove this conditional break when doing the actual training; I have added it only to finish the execution quickly and check that a model gets saved. After execution, you will see a model named “net_9.pth” in the “savedModels” directory.
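
As a quick sanity check, you can load the checkpoint back and inspect the state dictionary; the keys should match the parameter names of ft_net. A minimal sketch:

import torch

state_dict = torch.load('./savedModels/net_9.pth')
print('{} tensors saved'.format(len(state_dict)))
print(list(state_dict.keys())[:5])  # first few parameter names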

In my future blogs, we will learn how to train the model on cloud instances.

References

[1] https://niruhan.medium.com/training-a-neural-network-in-pytorch-for-a-computer-vision-task-person-re-identification-b2b23d2cc8d0
[2] https://niruhan.medium.com/drawing-loss-curves-for-deep-neural-network-training-in-pytorch-ac617b24c388
[3] https://pytorch.org/tutorials/beginner/saving_loading_models.html
