# Lab 1: Gradient Descent vs Stochastic Gradient Descent, and the Effects of Batch Size

``````
#Generating a random dataset
# Numpy is a fundamental package for scientific computing. It contains an implementation of an array
# that we will use in this exercise.
import numpy as np
# We will be generating our own random dataset. As a consequence we need functionality to generate random numbers.
import random
# We will be plotting the progress of training using matplotlib, a package that can be used to generate 2D and 3D plots.
# We use the "widget" option to enable interactivity later on.
%matplotlib widget
import matplotlib.pyplot as plt
# We will use TensorFlow as the deep learning framework of choice for this class.
import tensorflow as tf

# Number of samples (data points) in the synthetic dataset.
n_samples = 100
# The data is scattered around the line y = w_gen * x + b_gen.
w_gen = 10
b_gen = 2
# Mean and standard deviation of the Gaussian noise that perturbs the samples.
mean_gen = 0
std_gen = 1

# Draw the inputs uniformly from [0, 10).
x = np.random.uniform(0, 10, n_samples)
# Draw one noise value per sample in a single vectorized call. The noise is
# added to x *before* the multiplication by w_gen, so its effect on y is
# scaled by w_gen as well.
y = w_gen * (x + np.random.normal(loc=mean_gen, scale=std_gen, size=n_samples)) + b_gen

# Plot our randomly generated dataset.
# Close the current figure (if any) so repeated runs of this cell do not pile up figures.
plt.close()
# 'go' is the matplotlib format string for green circle markers with no connecting line.
plt.plot(x, y, 'go')
plt.xlabel("x", size=24)
plt.ylabel("y", size=24)
# Enlarge the tick labels on both axes to match the large axis labels.
plt.tick_params(axis='both', labelsize=16)
# Recompute the layout so the enlarged labels are not clipped.
plt.tight_layout()
plt.show()

#Defining the model
# Placeholders through which the training data is fed into the graph at run time.
X = tf.placeholder(tf.float32, name="X")
Y = tf.placeholder(tf.float32, name="Y")

# Model variables: w (weight) is intended to learn the slope w_gen, and
# b (bias) is intended to learn the intercept b_gen.
# For simplicity, both are initialized to zero.
w = tf.Variable(0.0, name="weights")
b = tf.Variable(0.0, name="bias")

# The model itself: a single linear neuron.
Y_predicted = w * X + b

#Defining the loss function
# Mean squared error between the prediction and the target; it indicates how
# good or bad the model is at any point in time.
loss = tf.reduce_mean(tf.squared_difference(Y_predicted, Y))

#Defining the optimization logic: gradient descent
# Define a gradient descent optimizer. Every training loop in this lab runs
# `optimizer` as the parameter-update op, so it has to exist before the first loop.
# NOTE(review): the learning rate was not present in the source; 0.001 is a
# conservative choice for x in [0, 10) -- confirm against the intended value.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)

#Training loop
# Upper bound on the number of passes over the entire dataset (epochs).
# In practice training stops much earlier, as soon as the convergence
# check at the bottom of the loop fires.
max_number_of_epochs = 1000

# Per-epoch history of the optimization, consumed by the plots that follow.
loss_array = []
b_array = []
w_array = []

# Full-batch gradient descent: every step is fed the complete dataset.
full_batch = {X: x, Y: y}

with tf.Session() as sess:

    # Variables must be initialized before they can be read or updated.
    sess.run(tf.global_variables_initializer())

    # Report the parameters and loss before any training has happened.
    w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict=full_batch)
    print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w_value, b_value, loss_value))

    print("")
    print("Starting training")
    print("")

    for i in range(max_number_of_epochs):

        # One parameter update computed from the gradient over the whole dataset.
        sess.run(optimizer, feed_dict=full_batch)

        # Record the post-update state for the visualizations.
        w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict=full_batch)
        w_array.append(w_value)
        b_array.append(b_value)
        loss_array.append(loss_value)

        # Progress report every fifth epoch.
        if (i + 1) % 5 == 0:
            print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, w_value, b_value, loss_value))

        # Convergence check: stop once the loss changed by less than 0.1%
        # relative to the previous epoch. Needs at least two recorded epochs.
        if i >= 1:
            previous_loss = loss_array[-2]
            if np.abs(loss_value - previous_loss) / previous_loss < 0.001:
                break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w_value, b_value, loss_value))

# Plot the loss recorded after each epoch of the training loop.
plt.close()
plt.plot(loss_array)
plt.xlabel("Epoch", size=24)
plt.ylabel("Loss", size=24)
plt.tick_params(axis='both', labelsize=16)
plt.tight_layout()
plt.show()

#Investigating the progress of the loss function
# Importing Axes3D registers the '3d' projection on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in matplotlib 3.4
# and is rejected by recent releases; add_subplot() works on old and new ones.
ax = fig.add_subplot(111, projection='3d')

# One point per epoch: the (w, b) reached and the loss at that point.
ax.scatter(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()

# Flattened samples of the loss surface: entry k of each list describes one
# (w, b) grid point and the loss evaluated there on the full dataset.
loss_surface = []
w_surface = []
b_surface = []

# Parameter values at which the loss is evaluated (200 x 200 grid).
w_grid = np.linspace(0, 20, 200)
b_grid = np.linspace(-18, 22, 200)

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    for w_value in w_grid:
        for b_value in b_grid:

            # w and b are supplied through feed_dict together with the data,
            # so the loss is evaluated at this grid point.
            loss_value = sess.run(loss, feed_dict={X: x, Y: y, w: w_value, b: b_value})
            w_surface.append(w_value)
            b_surface.append(b_value)
            loss_surface.append(loss_value)

plt.close()

fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in matplotlib 3.4
# and is rejected by recent releases; add_subplot() works on old and new ones.
ax2 = fig.add_subplot(111, projection='3d')

# Nearly transparent point cloud of the sampled loss surface, colored by loss value...
ax2.scatter(w_surface, b_surface, loss_surface, c = loss_surface, alpha = 0.02)
# ...with the path taken by the optimizer drawn on top in black.
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()
``````

``````
# Define the maximum number of times we want to process the entire dataset (the number of epochs).
# In practice we won't run this many because we'll implement an early stopping condition that
# detects when the training process has converged.
max_number_of_epochs = 1000

# History of the optimization. In this exercise one entry is recorded per
# parameter update, i.e. n_samples entries per epoch.
loss_array = []
b_array = []
w_array = []

with tf.Session() as sess:

    # Initialize the necessary variables
    sess.run(tf.global_variables_initializer())

    # Print out the parameters and loss before we do any training
    w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict={X: x, Y: y})
    print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w_value, b_value, loss_value))

    print("")
    print("Starting training")
    print("")

    # Start the training process
    for i in range(max_number_of_epochs):

        # Stochastic gradient descent: update after every single data point
        for (x_pt, y_pt) in zip(x, y):
            sess.run(optimizer, feed_dict={X: x_pt, Y: y_pt})

            # Capture the data that we will use in our visualization.
            # The loss is evaluated on the point that was just used for the
            # update, so loss_array grows by n_samples entries per epoch.
            w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict={X: x_pt, Y: y_pt})
            w_array.append(w_value)
            b_array.append(b_value)
            loss_array.append(loss_value)

        # Epoch i (0-based) contributed the entries [i * n_samples, (i + 1) * n_samples).
        # Using (i - 1) * n_samples as the start would select the *previous*
        # epoch and make every statistic below lag one epoch behind.
        this_epoch = slice(i * n_samples, (i + 1) * n_samples)
        average_loss_this_epoch = sum(loss_array[this_epoch]) / n_samples

        # At the end of every few epochs print out the learned weights
        if (i + 1) % 5 == 0:
            avg_w = sum(w_array[this_epoch]) / n_samples
            avg_b = sum(b_array[this_epoch]) / n_samples
            avg_loss = average_loss_this_epoch
            print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, avg_w, avg_b, avg_loss))

        # End the training when the loss averaged over this epoch differs from
        # the loss averaged over the last epoch by less than 0.1% (relative).
        # A previous epoch exists from i == 1 onwards.
        if i >= 1:
            last_epoch = slice((i - 1) * n_samples, i * n_samples)
            average_loss_last_epoch = sum(loss_array[last_epoch]) / n_samples
            if abs(average_loss_this_epoch - average_loss_last_epoch) / average_loss_last_epoch < 0.001:
                break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

# Final report: parameters and loss averaged over the last completed epoch.
avg_w = sum(w_array[this_epoch]) / n_samples
avg_b = sum(b_array[this_epoch]) / n_samples
avg_loss = sum(loss_array[this_epoch]) / n_samples

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(avg_w, avg_b, avg_loss))

# Plot the loss recorded after every parameter update.
plt.close()
plt.plot(loss_array)
plt.ylabel("Loss", size=24)
plt.tick_params(axis='both', labelsize=16)
plt.tight_layout()
plt.show()

# Importing Axes3D registers the '3d' projection on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D

plt.close()

fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in matplotlib 3.4
# and is rejected by recent releases; add_subplot() works on old and new ones.
ax = fig.add_subplot(111, projection='3d')
# Path of (w, b, loss) across all updates.
ax.plot(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()

plt.close()

fig = plt.figure()
ax2 = fig.add_subplot(111, projection='3d')

# Overlay the optimizer path (black) on the loss surface sampled earlier
# (w_surface, b_surface and loss_surface come from the first exercise).
ax2.scatter(w_surface, b_surface, loss_surface, c = loss_surface, alpha = 0.02)
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()
``````

## Optimizing training with batch size

``````
# Define the maximum number of times we want to process the entire dataset (the number of epochs).
# In practice we won't run this many because we'll implement an early stopping condition that
# detects when the training process has converged.
import math
# Hard cap on the number of epochs; the convergence check normally ends
# training long before this is reached.
max_number_of_epochs = 1000

# Start this exercise with empty histories of the optimization process
# (three independent lists: loss, bias and weight).
loss_array, b_array, w_array = [], [], []

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Any sliceable sequence that supports ``len()``.
        n: Chunk size; the final chunk is shorter when ``len(lst)`` is not
            a multiple of ``n``.

    Yields:
        ``lst[k:k + n]`` for ``k = 0, n, 2n, ...`` in order.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

with tf.Session() as sess:

    # Initialize the necessary variables
    sess.run(tf.global_variables_initializer())

    # Print out the parameters and loss before we do any training
    w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict={X: x, Y: y})
    print("Before training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(w_value, b_value, loss_value))

    print("")
    print("Starting training")
    print("")

    # Pass in batches of the dataset
    # After the first run, try batch sizes of 16, 64, and 128
    batch_size = 128
    # Number of parameter updates per epoch: ceil(n_samples / batch_size),
    # computed in integer arithmetic. The last batch may be smaller.
    num_batches_in_epoch = (n_samples + batch_size - 1) // batch_size

    # Start the training process
    for i in range(max_number_of_epochs):

        # Walk x and y in lockstep, batch_size samples at a time.
        for (x_batch, y_batch) in zip(chunks(x, batch_size), chunks(y, batch_size)):
            sess.run(optimizer, feed_dict={X: x_batch, Y: y_batch})

            # Capture the data that we will use in our visualization.
            # These are calculated only with the current batch.
            w_value, b_value, loss_value = sess.run([w, b, loss], feed_dict={X: x_batch, Y: y_batch})
            w_array.append(w_value)
            b_array.append(b_value)
            loss_array.append(loss_value)

        # Epoch i (0-based) contributed the entries
        # [i * num_batches_in_epoch, (i + 1) * num_batches_in_epoch).
        # Using (i - 1) * num_batches_in_epoch as the start would select the
        # *previous* epoch and make every statistic below lag one epoch behind.
        this_epoch = slice(i * num_batches_in_epoch, (i + 1) * num_batches_in_epoch)
        average_loss_this_epoch = sum(loss_array[this_epoch]) / num_batches_in_epoch

        # At the end of every few epochs print out the learned weights
        if (i + 1) % 5 == 0:
            avg_w = sum(w_array[this_epoch]) / num_batches_in_epoch
            avg_b = sum(b_array[this_epoch]) / num_batches_in_epoch
            avg_loss = average_loss_this_epoch
            print("Epoch = {:2d}: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(i+1, avg_w, avg_b, avg_loss))

        # End the training when the loss averaged over this epoch differs from
        # the loss averaged over the last epoch by less than 0.1% (relative).
        # A previous epoch exists from i == 1 onwards.
        if i >= 1:
            last_epoch = slice((i - 1) * num_batches_in_epoch, i * num_batches_in_epoch)
            average_loss_last_epoch = sum(loss_array[last_epoch]) / num_batches_in_epoch
            if abs(average_loss_this_epoch - average_loss_last_epoch) / average_loss_last_epoch < 0.001:
                break

print("")
print("Training finished after {} epochs".format(i+1))
print("")

# Final report: parameters and loss averaged over the last completed epoch.
avg_w = sum(w_array[this_epoch]) / num_batches_in_epoch
avg_b = sum(b_array[this_epoch]) / num_batches_in_epoch
avg_loss = sum(loss_array[this_epoch]) / num_batches_in_epoch

print("After training: w = {:4.3f}, b = {:4.3f}, loss = {:7.3f}".format(avg_w, avg_b, avg_loss))

# Plot the loss recorded after every batch update.
plt.close()
plt.plot(loss_array)
plt.ylabel("Loss", size=24)
plt.tick_params(axis='both', labelsize=16)
plt.tight_layout()
plt.show()

# Importing Axes3D registers the '3d' projection on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D

plt.close()

fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in matplotlib 3.4
# and is rejected by recent releases; add_subplot() works on old and new ones.
ax = fig.add_subplot(111, projection='3d')
# Path of (w, b, loss) across all batch updates.
ax.plot(w_array, b_array, loss_array)

ax.set_xlabel('w', size=16)
ax.set_ylabel('b', size=16)
ax.tick_params(labelsize=12)

plt.show()

plt.close()

fig = plt.figure()
ax2 = fig.add_subplot(111, projection='3d')

# Overlay the optimizer path (black) on the loss surface sampled earlier
# (w_surface, b_surface and loss_surface come from the first exercise).
ax2.scatter(w_surface, b_surface, loss_surface, c = loss_surface, alpha = 0.02)
ax2.plot(w_array, b_array, loss_array, color='black')

ax2.set_xlabel('w')
ax2.set_ylabel('b')

plt.show()
``````

## The Fashion-MNIST Dataset

### The Python Script

``````
from __future__ import print_function

import argparse
import keras
from keras import backend as K
from keras.preprocessing import image
from keras.datasets import fashion_mnist
from keras_contrib.applications.wide_resnet import WideResidualNetwork
import numpy as np
import tensorflow as tf
import os
from time import time

# Parse input arguments

parser = argparse.ArgumentParser(description='Keras Fashion MNIST Example',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# NOTE(review): the opening `parser.add_argument(...` of every option was
# missing from the source; only the `help=` fragments survived. The flag names
# below follow how the script reads `args.*` and how the notebook invokes it
# (--epochs, --batch-size, --target-accuracy, --patience). The `--log-dir`
# name and all types/defaults are assumptions -- confirm against the original.
parser.add_argument('--log-dir', default='./logs',
                    help='tensorboard log directory')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size for training')
parser.add_argument('--val-batch-size', type=int, default=32,
                    help='input batch size for validation')
parser.add_argument('--epochs', type=int, default=40,
                    help='number of epochs to train')
parser.add_argument('--base-lr', type=float, default=0.01,
                    help='learning rate for a single GPU')
parser.add_argument('--wd', type=float, default=0.000005,
                    help='weight decay')
# TODO Step 2: Add target and patience arguments to the argument parser
# Defaults mirror the StopAtAccuracy constructor defaults (0.85 and 2).
parser.add_argument('--target-accuracy', type=float, default=0.85,
                    help='Target accuracy to stop training')
parser.add_argument('--patience', type=int, default=2,
                    help='Number of epochs that meet target before stopping')

args = parser.parse_args()

# Define a function for a simple learning rate decay over time

def lr_schedule(epoch):
    """Return the learning rate to use for the given epoch (step decay).

    The base rate ``args.base_lr`` is used for epochs 0-14 and is reduced by
    a further factor of ten at epochs 15, 25 and 35.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate for that epoch.
    """
    base = args.base_lr
    # Check the latest decay stage first and fall through to earlier ones.
    if epoch >= 35:
        return 1e-3 * base
    if epoch >= 25:
        return 1e-2 * base
    if epoch >= 15:
        return 1e-1 * base
    return base

# Define the function that creates the model

def create_model():
    """Build and compile the WideResNet-16-10 classifier.

    Reads the module-level ``input_shape`` and ``num_classes`` as well as
    ``args.wd`` (L2 weight-decay factor) and ``args.base_lr`` (learning rate).

    Returns:
        A Keras model compiled with categorical cross-entropy loss, an SGD
        optimizer and the accuracy metric.
    """
    # Set up standard WideResNet-16-10 model.
    model = WideResidualNetwork(depth=16, width=10, weights=None, input_shape=input_shape,
                                classes=num_classes, dropout_rate=0.01)

    # The original comment notes that the bundled WideResNet is optimized for
    # inference. The loop below rewrites the layer configs for training: it
    # adds L2 kernel regularization to every layer that has a kernel
    # regularizer slot and sets the batch-normalization momentum and epsilon.
    model_config = model.get_config()
    # The regularizer does not depend on the layer, so build it once.
    regularizer = keras.regularizers.l2(args.wd)
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            # A fresh dict per layer so the layer configs never share state.
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        # isinstance (rather than an exact type comparison) also matches
        # subclasses of BatchNormalization.
        if isinstance(layer, keras.layers.BatchNormalization):
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5

    # Rebuild the model from the patched config so the changes take effect.
    model = keras.models.Model.from_config(model_config)

    opt = keras.optimizers.SGD(lr=args.base_lr)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

verbose = 1

# Input image dimensions
img_rows, img_cols = 28, 28
num_classes = 10

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Train only on 1/6 of the dataset (the first 10,000 samples)
x_train = x_train[:10000,:,:]
y_train = y_train[:10000]

# Add an explicit single-channel axis in the position the backend expects.
# reshape() needs the number of samples (shape[0]) as its first dimension;
# passing the whole shape tuple there is an error.
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

# Convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Training data iterator: feature-wise normalization plus augmentation
# (horizontal flips and width/height shifts of up to 20%).
train_gen = image.ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True,
                                     horizontal_flip=True, width_shift_range=0.2, height_shift_range=0.2)
# fit() computes the normalization statistics from the training images.
train_gen.fit(x_train)
train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

# Validation data iterator: no augmentation, and it reuses the mean/std
# computed on the training data instead of fitting its own.
test_gen = image.ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True)
test_gen.mean = train_gen.mean
test_gen.std = train_gen.std
test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

# TODO Step 1: Define the PrintThroughput callback
class PrintThroughput(keras.callbacks.Callback):
    """Keras callback that prints the training throughput after every epoch.

    Throughput is ``total_images`` divided by the wall-clock duration of the
    epoch, rounded to two decimals.

    Args:
        total_images: Number of images processed per epoch.
    """

    def __init__(self, total_images=0):
        # Let the base class set up its own state before adding ours.
        super(PrintThroughput, self).__init__()
        self.total_images = total_images
        self.epoch_start_time = None

    def on_epoch_begin(self, epoch, logs=None):
        """Record the wall-clock time at which the epoch starts."""
        self.epoch_start_time = time()

    def on_epoch_end(self, epoch, logs=None):
        """Print the images/sec achieved during the epoch that just ended."""
        # `logs` is unused; None is used as the default instead of a shared
        # mutable dict.
        epoch_time = time() - self.epoch_start_time
        images_per_sec = round(self.total_images / epoch_time, 2)
        print('Images/sec: {}'.format(images_per_sec))

# TODO Step 2: Define the StopAtAccuracy callback
class StopAtAccuracy(keras.callbacks.Callback):
    """Keras callback that stops training once validation accuracy is high enough.

    Training stops after the validation accuracy has exceeded ``target`` for
    ``patience`` consecutive epochs; any epoch at or below the target resets
    the count.

    Args:
        target: Validation accuracy that must be exceeded.
        patience: Number of consecutive epochs above ``target`` required
            before training is stopped.
    """

    def __init__(self, target=0.85, patience=2):
        # Let the base class set up its own state before adding ours.
        super(StopAtAccuracy, self).__init__()
        self.target = target
        self.patience = patience
        self.stopped_epoch = 0
        # Explicit flag: `stopped_epoch > 0` alone cannot tell "stopped at
        # epoch index 0" apart from "never stopped".
        self.stopped = False
        self.met_target = 0

    def on_epoch_end(self, epoch, logs=None):
        """Update the streak counter and request a stop when it reaches ``patience``."""
        logs = logs or {}
        # The validation accuracy is logged as 'val_acc' or 'val_accuracy'
        # depending on the Keras version.
        val_acc = logs.get('val_acc', logs.get('val_accuracy'))
        if val_acc is None:
            # Without validation accuracy there is nothing to compare; warn
            # instead of failing on a None comparison.
            import warnings
            warnings.warn('StopAtAccuracy requires validation accuracy in logs; '
                          'available keys: {}'.format(','.join(sorted(logs))),
                          RuntimeWarning)
            return

        if val_acc > self.target:
            self.met_target += 1
        else:
            self.met_target = 0

        if self.met_target >= self.patience:
            self.stopped_epoch = epoch
            self.stopped = True
            self.model.stop_training = True

    def on_train_end(self, logs=None):
        """Report the (1-based) epoch after which training was stopped early."""
        if self.stopped:
            print('Early stopping after epoch {}'.format(self.stopped_epoch + 1))

# TODO Step 3: Define the PrintTotalTime callback
class PrintTotalTime(keras.callbacks.Callback):
    """Keras callback that prints the cumulative training time.

    The elapsed wall-clock time since the start of training is printed after
    every epoch and once more when training ends.
    """

    def on_train_begin(self, logs=None):
        """Remember when training started."""
        self.start_time = time()

    def _elapsed(self):
        """Return the seconds elapsed since training began, rounded to two decimals."""
        return round(time() - self.start_time, 2)

    def on_epoch_end(self, epoch, logs=None):
        """Print the cumulative time after the (1-based) epoch that just ended."""
        print("Cumulative training time after epoch {}: {}".format(epoch + 1, self._elapsed()))

    def on_train_end(self, logs=None):
        """Print the total training time."""
        print("Cumulative training time: {}".format(self._elapsed()))

# Callbacks run during training: throughput reporting, accuracy-based early
# stopping, cumulative timing and the step-decay learning-rate schedule.
callbacks = [
    PrintThroughput(total_images=len(y_train)),
    StopAtAccuracy(target=args.target_accuracy, patience=args.patience),
    PrintTotalTime(),
    keras.callbacks.LearningRateScheduler(lr_schedule),
]

# Create the model.

model = create_model()

# Train the model. One epoch is one full pass over the training iterator,
# and validation runs over the whole test iterator.
model.fit_generator(
    train_iter,
    steps_per_epoch=len(train_iter),
    epochs=args.epochs,
    initial_epoch=0,
    callbacks=callbacks,
    validation_data=test_iter,
    validation_steps=len(test_iter),
    workers=4,
    verbose=verbose,
)

# Evaluate the model on the full data set.
# The model is compiled with one loss and metrics=['accuracy'], so the result
# is a two-element list: score[0] is the loss and score[1] is the accuracy.
score = model.evaluate_generator(test_iter, len(test_iter), workers=4)
if verbose:
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
``````

### The Notebook

``````
!python fashion_mnist.py
!cp fashion_mnist.py fashion_mnist_original.py
!python fashion_mnist.py --epochs 5
!python fashion_mnist.py --epochs 5 --batch-size 700
%matplotlib widget
import matplotlib.pyplot as plt

# Measured training throughput per batch size, as (batch size label,
# images / sec) pairs. The batch sizes are kept as strings so the bar chart
# treats them as evenly spaced categories.
data = [('8', 328),
("16", 551),
("32", 808),
("64", 1002),
("128", 1165),
("256", 1273),
("512", 1329),
("700", 1332)] # See what happens when you go much above 700

# Split the pairs into one tuple of labels (x) and one tuple of throughputs (y).
x,y = zip(*data)
plt.bar(x,y)
plt.ylabel("Throughput (images / sec)")
plt.xlabel("Batch Size")
plt.show()
!python fashion_mnist.py --target-accuracy .82 --patience 2
!python fashion_mnist.py --batch-size 32 --target-accuracy 0.82 --patience 2
``````

Updated: