Here we compare the accuracy and computation time of training simple fully-connected neural networks, using numpy and pytorch implementations applied to the MNIST data set. The numpy and pytorch implementations of the Adam optimization algorithm are compared, along with a numpy implementation of the Scaled Conjugate Gradient optimization algorithm.
The original notebook is available here.
Additional comments and explanations will be added shortly. If you have suggestions or corrections, please write to chuck.anderson@colostate.edu.
In [ ]:
!nvidia-smi
In [ ]:
import numpy as np
import pickle
import gzip
import json
import time
import sys
import subprocess
# For my numpy neural network implementation
import neuralnetworks as nn
# for pytorch
import os
import torch
import torch.nn as tnn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
# for reading and plotting results
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
######################################################################
## Load the pickled MNIST splits and build the float32 matrix dataTrain
# NOTE: pickle.load can execute arbitrary code; only open a trusted mnist.pkl.gz.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

# Each split is indexed as (inputs, labels); labels become column vectors.
Xtrain, Xval, Xtest = (
    np.array(split[0]) for split in (train_set, valid_set, test_set))
Ttrain, Tval, Ttest = (
    np.array(split[1]).reshape((-1, 1))
    for split in (train_set, valid_set, test_set))

# to match with main-gpu.py in res/pytorch...01../feed.../main-gpu.py:
# the validation samples are appended to the training samples.
Xtrain = np.concatenate((Xtrain, Xval), axis=0)
Ttrain = np.concatenate((Ttrain, Tval), axis=0)

# Inputs and labels share one matrix (label in the last column) so that
# shuffling its rows keeps every sample paired with its target.
dataTrain = np.concatenate((Xtrain, Ttrain), axis=1).astype(np.float32)
nSamples = dataTrain.shape[0]
In [ ]:
######################################################################
## Write results
def writeResults(filename, timeAcc, label):
    """Write one labelled run of (seconds, error) pairs as plain text.

    The record layout is: the label on one line, the number of pairs on
    the next, then one line per pair formatted as '{:.2f} {:.3f}'.

    Parameters
    ----------
    filename : str
        Path of the file to append to, or the literal 'stdout' to write
        to sys.stdout instead.
    timeAcc : sequence of indexable pairs
        Each item supplies elapsed seconds at index 0 and the error
        fraction at index 1.
    label : str
        Description of the run, written as the record's first line.
    """
    # Format the whole record first so a bad value raises before anything
    # is written, instead of leaving a truncated record in the file.
    record = [label + '\n', str(len(timeAcc)) + '\n']
    record.extend('{:.2f} {:.3f}\n'.format(ta[0], ta[1]) for ta in timeAcc)
    text = ''.join(record)

    if filename == 'stdout':
        sys.stdout.write(text)
    else:
        # The context manager closes the file even if the write fails.
        with open(filename, 'a') as f:
            f.write(text)
In [ ]:
def runnumpy(batchSize=None, numEpochs=10, hidden=[100], nIterations=100, useRelu=False, useAdam=False):
    """Train the numpy classifier on dataTrain and record test error over time.

    After each of the numEpochs training rounds the network is applied to
    Xtest, and the pair [elapsed seconds, fraction of test samples
    misclassified] is recorded. The collected pairs are passed to
    writeResults for resultsFilename, and also for 'stdout' when
    numEpochs <= 10.

    Parameters
    ----------
    batchSize : int or None
        None trains on all of dataTrain in every round. Otherwise
        dataTrain is shuffled in place each epoch and the network is
        trained on consecutive slices of batchSize rows.
    numEpochs : int
        Number of training rounds, and so the number of recorded points.
    hidden : list of int
        Hidden layer sizes. The list is only read, never mutated.
    nIterations : int
        Passed to nnet.train for every call.
    useRelu : bool
        Passed to the network constructor; also selects the ReLU/Tanh
        tag in the label.
    useAdam : bool
        Passed to nnet.train; also selects the Adam/SCG tag in the label.

    Reads the module-level names dataTrain, Xtest, Ttest and
    resultsFilename.
    """
    label = 'Numpy '
    label += 'Adam' if useAdam else 'SCG'
    label += ' batch {} epochs {:d} hids {} nIter {:d}'.format(batchSize, numEpochs, hidden, nIterations)
    label += ' ReLU ' if useRelu else ' Tanh '
    # NOTE(review): the year in the timestamp is hard-coded as '17'.
    label += time.strftime('%m/%d/17-%H:%M')

    # The last column of dataTrain holds the class label.
    Xtrain = dataTrain[:, :-1]
    Ttrain = dataTrain[:, -1:]

    nnet = nn.NeuralNetworkClassifier([Xtrain.shape[1]] + hidden + [10],
                                      np.arange(10), useRelu=useRelu)
    # NOT STANDARDIZING THE INPUTS!!!
    nnet.setStandardize(False)

    secsAcc = []

    # numEpochs determines number of breaks during training to calculate test error
    if batchSize is None:
        startTime = time.time()
        for _ in range(numEpochs):
            # useAdam is forwarded here as well; previously this branch
            # ignored it, so runs labelled 'Adam' used the default optimizer.
            nnet.train(Xtrain, Ttrain, nIterations=nIterations, verbose=False,
                       useAdam=useAdam)
            ptest = nnet.use(Xtest)
            secsAcc.append([time.time() - startTime, np.mean(ptest != Ttest)])

    else:  # numpy on batches
        nSamples = dataTrain.shape[0]
        if nSamples % batchSize != 0:
            # Fixed: the original referenced the misspelled name 'nSampmles',
            # which raised NameError instead of printing this warning.
            print('WARNING: nSamples {} is not divisible by batchSize {}'.format(
                nSamples, batchSize))

        startTime = time.time()
        for epoch in range(numEpochs):
            # In-place row shuffle of the module-level training matrix.
            np.random.shuffle(dataTrain)
            for traini in range(0, nSamples, batchSize):
                Xtrain = dataTrain[traini:traini + batchSize, :-1]
                Ttrain = dataTrain[traini:traini + batchSize, -1:]
                nnet.train(Xtrain, Ttrain, restart=True, nIterations=nIterations, useAdam=useAdam)
            # Test error is measured once per epoch, after all batches.
            ptest = nnet.use(Xtest)
            secsAcc.append([time.time() - startTime, np.mean(ptest != Ttest)])

    writeResults(resultsFilename, secsAcc, label)
    if numEpochs <= 10:
        writeResults('stdout', secsAcc, label)
In [ ]:
def runpytorch(batchSize=100, numEpochs=10, hidden=[100],
               learningRate=0.001, nIterations=100, useRelu=True, useGPU=False):
    """Train a PyTorch network with Adam on MNIST and record test error over time.

    MNIST is loaded through torchvision (root './data'; the training set
    is downloaded if needed). After every epoch the network is evaluated
    on the test set and the pair [elapsed seconds, fraction of test
    samples misclassified] is recorded. The collected pairs are passed to
    writeResults for resultsFilename, and also for 'stdout' when
    numEpochs <= 10.

    Parameters
    ----------
    batchSize : int
        Batch size of both data loaders.
    numEpochs : int
        Number of passes over the training loader.
    hidden : list of int
        One or two hidden layer sizes; entries beyond the second are unused.
    learningRate : float
        Learning rate given to torch.optim.Adam.
    nIterations : int
        Number of optimizer steps taken on each batch.
    useRelu : bool
        ReLU activations when True, Tanh otherwise.
    useGPU : bool
        Run on CUDA device 0. If CUDA is not available a message is
        printed and the function returns without training.

    Reads the module-level names dataTrain, nSamples and resultsFilename.
    """
    if useGPU:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        print('torch.cuda.is_available() is', torch.cuda.is_available())
        if not torch.cuda.is_available():
            print('GPU is not available. Not running sgd pytorch cpu')
            return

    label = 'Pytorch '
    if useGPU:
        label += 'GPU '
    label += 'Adam batch {} epochs {:d} lr {:.6f} hids {} nIter {:d}'.format(batchSize, numEpochs, learningRate, hidden, nIterations)
    label += ' ReLU ' if useRelu else ' Tanh '
    # NOTE(review): the year in the timestamp is hard-coded as '17'.
    label += time.strftime('%m/%d/17-%H:%M')

    # Neural Network Model (1 or 2 hidden layers)
    class Net(tnn.Module):
        def __init__(self, input_size, hidden_size, num_classes):
            # Initialise the Module machinery before setting any attribute.
            super(Net, self).__init__()
            self.hidden_size = hidden_size
            self.fc1 = tnn.Linear(input_size, hidden_size[0])
            self.relu = tnn.ReLU() if useRelu else tnn.Tanh()
            if len(hidden_size) > 1:
                self.fc2 = tnn.Linear(hidden_size[0], hidden_size[1])
                self.relu2 = tnn.ReLU() if useRelu else tnn.Tanh()
                self.fc3 = tnn.Linear(hidden_size[1], num_classes)
            else:
                self.fc3 = tnn.Linear(hidden_size[0], num_classes)

        def forward(self, x):
            out = self.fc1(x)
            out = self.relu(out)
            if len(self.hidden_size) > 1:
                out = self.fc2(out)
                out = self.relu2(out)
            out = self.fc3(out)
            return out

    train_dataset = dsets.MNIST(root='./data',
                                train=True,
                                transform=transforms.ToTensor(),
                                download=True)
    test_dataset = dsets.MNIST(root='./data',
                               train=False,
                               transform=transforms.ToTensor())

    # Data Loader (Input Pipeline)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batchSize,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batchSize,
                                              shuffle=False)

    num_classes = 10
    net = Net(784, hidden, num_classes)
    if useGPU:
        net.cuda()

    # Loss and Optimizer
    criterion = tnn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learningRate)

    global dataTrain, nSamples
    # NOTE(review): nSamples is the row count of the numpy matrix, not of
    # train_dataset; presumably both hold the same samples -- confirm.
    if nSamples % batchSize != 0:
        # Fixed: the original referenced the misspelled name 'nSampmles',
        # which raised NameError instead of printing this warning.
        print('WARNING: nSamples {} is not divisible by batchSize {}'.format(
            nSamples, batchSize))

    secsAcc = []
    startTime = time.time()
    for epoch in range(numEpochs):
        # NOTE(review): this shuffles the numpy matrix, which the loop below
        # never reads (batches come from train_loader). Kept so the timed
        # work and the global state are unchanged -- confirm it is wanted.
        np.random.shuffle(dataTrain)

        for i, (images, labels) in enumerate(train_loader):
            # Flatten each 28x28 image into a 784-element row.
            images = Variable(images.view(-1, 28*28))
            labels = Variable(labels)
            if useGPU:
                images = images.cuda()
                labels = labels.cuda()

            # Forward + Backward + Optimize
            for _ in range(nIterations):
                optimizer.zero_grad()  # zero the gradient buffer
                outputs = net(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Evaluate on the whole test set once per epoch.
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = Variable(images.view(-1, 28*28))
            if useGPU:
                images = images.cuda()
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            if useGPU:
                predicted = predicted.cpu()
            # int() keeps 'correct' a Python int whether sum() returns a
            # number or a 0-dim tensor, so the division below is a float.
            correct += int((predicted == labels).sum())

        secsAcc.append([time.time() - startTime, (total - correct) / total])

    writeResults(resultsFilename, secsAcc, label)
    if numEpochs <= 10:
        writeResults('stdout', secsAcc, label)
In [ ]:
resultsFilename = 'test.results'
# Start from a clean results file. os.remove works on every platform and,
# unlike shelling out to 'rm', stays silent when the file does not exist.
try:
    os.remove(resultsFilename)
except FileNotFoundError:
    pass
hidden = [500, 500]  # can contain one or two ints, for one or two hidden layers
batchSize = 100
numEpochs = 50
In [ ]:
# PyTorch Adam run: Tanh units, GPU, one optimizer step per batch.
runpytorch(batchSize=batchSize, numEpochs=numEpochs, hidden=hidden,
learningRate=0.001, nIterations=1, useRelu=False, useGPU=True)
In [ ]:
# PyTorch Adam run: ReLU units, CPU, one optimizer step per batch.
# NOTE(review): these arguments are identical to a later cell in this
# notebook; possibly useGPU=True was intended here -- confirm.
runpytorch(batchSize=batchSize, numEpochs=numEpochs, hidden=hidden,
learningRate=0.001, nIterations=1, useRelu=True, useGPU=False)
In [ ]:
# PyTorch Adam run: Tanh units, CPU, one optimizer step per batch.
runpytorch(batchSize=batchSize, numEpochs=numEpochs, hidden=hidden,
learningRate=0.001, nIterations=1, useRelu=False, useGPU=False)
In [ ]:
# PyTorch Adam run: ReLU units, CPU, one optimizer step per batch.
runpytorch(batchSize=batchSize, numEpochs=numEpochs, hidden=hidden,
learningRate=0.001, nIterations=1, useRelu=True, useGPU=False)
In [ ]:
def plotFromFile(filename='test.results'):
    """Plot every run stored in a results file on the current matplotlib axes.

    The file is read as a sequence of records: a label line, a line with
    the number of points n, then n lines of whitespace-separated floats
    (seconds, error fraction). Runs are drawn in sorted label order,
    cycling through a fixed list of markers. The sorted labels are also
    printed.

    Parameters
    ----------
    filename : str
        Path of the results file to read.

    Raises
    ------
    ValueError
        If a count line or a data value cannot be parsed, for example
        because the file is truncated.

    Notes
    -----
    Labels are stored without their trailing newline so legend entries
    are clean. If a label occurs more than once, the last record wins.
    """
    results = {}
    with open(filename, 'r') as f:
        while True:
            line = f.readline()
            if line == '':  # readline returns '' only at end of file
                break
            label = line.strip()
            if not label:
                # Tolerate blank lines between records or at the end.
                continue
            n = int(f.readline())
            results[label] = [[float(s) for s in f.readline().split()]
                              for _ in range(n)]

    markers = ['s', '8', '>', '^', '<', 'v', 'o', 'X', 'P', 'd', 'h', '*', 'p', 'D', 'H']
    mi = 0
    print(sorted(results))
    for key in sorted(results):
        value = np.array(results[key])
        if value.size == 0:
            # A run with no points cannot be indexed by column; skip it.
            continue
        plt.plot(value[:, 0], value[:, 1], '-',
                 marker=markers[mi], label=key, lw=4,
                 markersize=15)
        mi = (mi + 1) % len(markers)
    plt.xlabel('Seconds')
    plt.ylabel('Fraction of test samples incorrectly classified')
    plt.legend()
In [ ]:
# cat test.results
In [ ]:
# Plot every run recorded so far in test.results on a large (20 x 12) figure.
plt.figure(figsize=(20, 12))
plotFromFile('test.results')
In [ ]:
# Mini-batch numpy runs, one optimizer iteration per batch, covering every
# activation/optimizer pairing in the order
# Tanh+Adam, Tanh+SCG, ReLU+Adam, ReLU+SCG.
for _relu in (False, True):
    for _adam in (True, False):
        runnumpy(batchSize=batchSize, numEpochs=numEpochs, hidden=hidden,
                 nIterations=1, useRelu=_relu, useAdam=_adam)
In [ ]:
# Numpy runs with batchSize=None (all samples in every training call):
# 50 rounds of 10 iterations each, covering every activation/optimizer
# pairing in the order Tanh+Adam, Tanh+SCG, ReLU+Adam, ReLU+SCG.
for _relu in (False, True):
    for _adam in (True, False):
        runnumpy(batchSize=None, numEpochs=50, hidden=hidden,
                 nIterations=10, useRelu=_relu, useAdam=_adam)
In [ ]:
# Re-plot test.results, which by now also holds the numpy runs.
plt.figure(figsize=(20, 12))
plotFromFile('test.results')
In [ ]:
# Same plot, with the y axis limited to error fractions 0.01 - 0.125.
plt.figure(figsize=(20, 12))
plotFromFile('test.results')
plt.ylim(0.01,0.125)
# plt.xlim(0,40)
Out[ ]:
In [ ]:
# Same plot, zoomed further in: y axis limited to error fractions 0.01 - 0.04.
plt.figure(figsize=(20, 12))
plotFromFile('test.results')
plt.ylim(0.01,0.04)
# plt.xlim(0,40)
Out[ ]:
In [ ]: