Advanced Methods for Improving the Performance of Convolutional Neural Networks

CSC/DSC 340 Week 11 Slides

Author: Dr. Julie Butler

Date Created: October 29, 2023

Last Modified: October 29, 2023

import tensorflow as tf

# Fetch the MNIST train/test splits through the Keras dataset loader
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Rescale the pixel values of both image arrays by 1/255
train_images = train_images / 255.0
test_images = test_images / 255.0

# Report the dimensions of every array that was loaded
print("Train images shape:", train_images.shape)
print("Train labels shape:", train_labels.shape)
print("Test images shape:", test_images.shape)
print("Test labels shape:", test_labels.shape)
Train images shape: (60000, 28, 28)
Train labels shape: (60000,)
Test images shape: (10000, 28, 28)
Test labels shape: (10000,)
import matplotlib.pyplot as plt
# Preview the first few training digits in a single row, each labelled
# underneath with its class
num_images = 5
plt.figure(figsize=(10, 3))
for i in range(num_images):
    axis = plt.subplot(1, num_images, i + 1)
    # Hide ticks and grid so only the digit itself is visible
    axis.set_xticks([])
    axis.set_yticks([])
    axis.grid(False)
    axis.imshow(train_images[i], cmap=plt.cm.binary)
    axis.set_xlabel(train_labels[i])
plt.show()

from tensorflow.keras import layers, models

# Assemble the baseline CNN layer by layer: three 3x3 convolutions
# (32, 64, 64 filters) with 2x2 max-pooling after the first two,
# followed by a 64-unit dense layer and a 10-way softmax output
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Integer class labels are used directly, hence the sparse loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 5 epochs, holding out 10% of the training data for validation
model.fit(
    train_images,
    train_labels,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
)

# Score the trained network on the test split
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)

# Drop the image arrays; later cells load their own copies
del train_images, test_images
Epoch 1/5
844/844 [==============================] - 17s 20ms/step - loss: 0.2010 - accuracy: 0.9379 - val_loss: 0.0659 - val_accuracy: 0.9803
Epoch 2/5
844/844 [==============================] - 21s 25ms/step - loss: 0.0547 - accuracy: 0.9829 - val_loss: 0.0395 - val_accuracy: 0.9885
Epoch 3/5
844/844 [==============================] - 18s 21ms/step - loss: 0.0379 - accuracy: 0.9881 - val_loss: 0.0373 - val_accuracy: 0.9898
Epoch 4/5
844/844 [==============================] - 18s 22ms/step - loss: 0.0291 - accuracy: 0.9907 - val_loss: 0.0352 - val_accuracy: 0.9908
Epoch 5/5
844/844 [==============================] - 21s 25ms/step - loss: 0.0233 - accuracy: 0.9922 - val_loss: 0.0379 - val_accuracy: 0.9893
313/313 - 1s - loss: 0.0373 - accuracy: 0.9868 - 1s/epoch - 4ms/step

Test accuracy: 0.9868000149726868

Improving Through Pre-Processing Steps

  • First let’s increase the size of the data set using data augmentation
    • Add images to the data set that are rotated, stretched, translated, and zoomed in versions of the original images
    • Should lead to better generalization and thus increased accuracy
  • Note that ImageDataGenerator requires the X data (the images) to be four-dimensional, (samples, height, width, channels), instead of three-dimensional
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import scipy.ndimage

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing and normalization: add an explicit trailing channel axis
# (N, 28, 28) -> (N, 28, 28, 1), cast to float32 and divide by 255
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1).astype('float32') / 255
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1).astype('float32') / 255

# Data augmentation: random rotations (up to 10 degrees), horizontal/vertical
# shifts (up to 10% of the image size) and zooms (up to 10%).
# No datagen.fit() call is needed: it only computes statistics for the
# featurewise_center / featurewise_std_normalization / zca_whitening options,
# none of which are enabled here.
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1
)

# Create a simple CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model with augmented data.
# steps_per_epoch must be an integer number of batches, so use floor division
# (the original `/` produced the float 1875.0) and derive it from the same
# batch size that is handed to the generator.
batch_size = 32
model.fit(datagen.flow(x_train, y_train, batch_size=batch_size),
          steps_per_epoch=len(x_train) // batch_size, epochs=5)

# Evaluate the model on the (un-augmented) test images
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test accuracy: {test_acc}')

# Release the arrays; the next cell reloads the data
del x_train
del x_test
Epoch 1/5
1875/1875 [==============================] - 29s 15ms/step - loss: 0.2803 - accuracy: 0.9118
Epoch 2/5
1207/1875 [==================>...........] - ETA: 9s - loss: 0.1032 - accuracy: 0.9681
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In [4], line 38
     33 model.compile(optimizer='adam',
     34               loss='sparse_categorical_crossentropy',
     35               metrics=['accuracy'])
     37 # Fit the model with augmented data
---> 38 model.fit(datagen.flow(x_train, y_train, batch_size=32),
     39           steps_per_epoch=len(x_train) / 32, epochs=5)
     41 # Evaluate the model
     42 test_loss, test_acc = model.evaluate(x_test, y_test)

File ~/Library/Python/3.9/lib/python/site-packages/keras/src/utils/traceback_utils.py:65, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     63 filtered_tb = None
     64 try:
---> 65     return fn(*args, **kwargs)
     66 except Exception as e:
     67     filtered_tb = _process_traceback_frames(e.__traceback__)

File ~/Library/Python/3.9/lib/python/site-packages/keras/src/engine/training.py:1783, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1775 with tf.profiler.experimental.Trace(
   1776     "train",
   1777     epoch_num=epoch,
   (...)
   1780     _r=1,
   1781 ):
   1782     callbacks.on_train_batch_begin(step)
-> 1783     tmp_logs = self.train_function(iterator)
   1784     if data_handler.should_sync:
   1785         context.async_wait()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/util/traceback_utils.py:150, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    148 filtered_tb = None
    149 try:
--> 150   return fn(*args, **kwargs)
    151 except Exception as e:
    152   filtered_tb = _process_traceback_frames(e.__traceback__)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:831, in Function.__call__(self, *args, **kwds)
    828 compiler = "xla" if self._jit_compile else "nonXla"
    830 with OptionalXlaContext(self._jit_compile):
--> 831   result = self._call(*args, **kwds)
    833 new_tracing_count = self.experimental_get_tracing_count()
    834 without_tracing = (tracing_count == new_tracing_count)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:867, in Function._call(self, *args, **kwds)
    864   self._lock.release()
    865   # In this case we have created variables on the first call, so we run the
    866   # defunned version which is guaranteed to never create variables.
--> 867   return tracing_compilation.call_function(
    868       args, kwds, self._no_variable_creation_config
    869   )
    870 elif self._variable_creation_config is not None:
    871   # Release the lock early so that multiple threads can perform the call
    872   # in parallel.
    873   self._lock.release()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py:139, in call_function(args, kwargs, tracing_options)
    137 bound_args = function.function_type.bind(*args, **kwargs)
    138 flat_inputs = function.function_type.unpack_inputs(bound_args)
--> 139 return function._call_flat(  # pylint: disable=protected-access
    140     flat_inputs, captured_inputs=function.captured_inputs
    141 )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/concrete_function.py:1264, in ConcreteFunction._call_flat(self, tensor_inputs, captured_inputs)
   1260 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
   1261 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
   1262     and executing_eagerly):
   1263   # No tape is watching; skip to running the function.
-> 1264   return self._inference_function.flat_call(args)
   1265 forward_backward = self._select_forward_and_backward_functions(
   1266     args,
   1267     possible_gradient_type,
   1268     executing_eagerly)
   1269 forward_function, args_with_tangents = forward_backward.forward()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py:217, in AtomicFunction.flat_call(self, args)
    215 def flat_call(self, args: Sequence[core.Tensor]) -> Any:
    216   """Calls with tensor inputs and returns the structured output."""
--> 217   flat_outputs = self(*args)
    218   return self.function_type.pack_output(flat_outputs)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py:252, in AtomicFunction.__call__(self, *args)
    250 with record.stop_recording():
    251   if self._bound_context.executing_eagerly():
--> 252     outputs = self._bound_context.call_function(
    253         self.name,
    254         list(args),
    255         len(self.function_type.flat_outputs),
    256     )
    257   else:
    258     outputs = make_call_op_in_graph(
    259         self,
    260         list(args),
    261         self._bound_context.function_call_options.as_attrs(),
    262     )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/context.py:1479, in Context.call_function(self, name, tensor_inputs, num_outputs)
   1477 cancellation_context = cancellation.context()
   1478 if cancellation_context is None:
-> 1479   outputs = execute.execute(
   1480       name.decode("utf-8"),
   1481       num_outputs=num_outputs,
   1482       inputs=tensor_inputs,
   1483       attrs=attrs,
   1484       ctx=self,
   1485   )
   1486 else:
   1487   outputs = execute.execute_with_cancellation(
   1488       name.decode("utf-8"),
   1489       num_outputs=num_outputs,
   (...)
   1493       cancellation_manager=cancellation_context,
   1494   )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/execute.py:60, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     53   # Convert any objects of type core_types.Tensor to Tensor.
     54   inputs = [
     55       tensor_conversion_registry.convert(t)
     56       if isinstance(t, core_types.Tensor)
     57       else t
     58       for t in inputs
     59   ]
---> 60   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     61                                       inputs, attrs, num_outputs)
     62 except core._NotOkStatusException as e:
     63   if name is not None:

KeyboardInterrupt: 
  • Next we will attempt to increase the accuracy by sharpening the edges of the digits by removing pixels that are light grey
  • For more complex images, there are algorithms for edge sharpening, deblurring, etc. that can be applied in a similar manner
import numpy as np
# Load the MNIST dataset again; the arrays are left unscaled here because the
# sharpening threshold below is expressed relative to 255
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Function to remove light-gray pixels from images (sets them to zero)
def remove_gray_images(images, threshold=5):
    """Return a copy of ``images`` with gray pixels set to zero.

    Every pixel whose value is below ``255 - threshold`` is replaced by 0;
    pixels at or above that cutoff are kept unchanged. The input is not
    modified, and images of any size are accepted.

    Parameters
    ----------
    images : array-like
        Pixel data, e.g. an array of shape (n_images, height, width).
    threshold : int, optional
        Distance below 255 at which a pixel is still kept (default 5).

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``np.array(images)``.
    """
    cutoff = 255 - threshold
    # np.array copies by default, so the caller's data is left untouched
    images_redone = np.array(images)
    # Vectorized mask replaces the per-pixel Python loops
    images_redone[images_redone < cutoff] = 0
    return images_redone

# Sharpen the training images: zero every pixel below 255 - 55 = 200
x_train_filtered = remove_gray_images(x_train, threshold=55)

# Sharpen the test images with the same cutoff
x_test_filtered = remove_gray_images(x_test, threshold=55)
  • The original images
# Reload MNIST to get a fresh copy of the images for comparison
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Preview the first few training digits in one row with their labels
num_images = 5
plt.figure(figsize=(10, 3))
for i in range(num_images):
    axis = plt.subplot(1, num_images, i + 1)
    # Hide ticks and grid so only the digit itself is visible
    axis.set_xticks([])
    axis.set_yticks([])
    axis.grid(False)
    axis.imshow(x_train[i], cmap=plt.cm.binary)
    axis.set_xlabel(y_train[i])
plt.show()
  • The sharpened images
# Preview the first few sharpened training digits in one row with their labels
num_images = 5
plt.figure(figsize=(10, 3))
for i in range(num_images):
    axis = plt.subplot(1, num_images, i + 1)
    # Hide ticks and grid so only the digit itself is visible
    axis.set_xticks([])
    axis.set_yticks([])
    axis.grid(False)
    axis.imshow(x_train_filtered[i], cmap=plt.cm.binary)
    axis.set_xlabel(y_train[i])
plt.show()
  • Now let’s train a CNN using the sharpened data set
# Scale the data and add the trailing channel axis that the Conv2D input
# shape (28, 28, 1) declares, rather than relying on implicit reshaping
x_train_filtered = np.expand_dims(x_train_filtered, axis=-1) / 255
x_test_filtered = np.expand_dims(x_test_filtered, axis=-1) / 255

# Build the model (same architecture as the baseline CNN)
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model, holding out 10% of the training data for validation
model.fit(x_train_filtered, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the model against y_test, the labels loaded together with these
# test images (previously this used `test_labels` left over from the first cell)
test_loss, test_acc = model.evaluate(x_test_filtered, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

# Release the arrays; the next cell reloads the data
del x_train
del x_test
del x_train_filtered
del x_test_filtered

Improving Accuracy with Differing Architectures

  • First we will test the addition of regularization and drop-out to the CNN
    • Regularization: Using L1 or L2 norms to control the values of the weights of the network (like LASSO and Ridge regression)
    • Dropout: randomly setting X% of the outputs (activations) of a layer to zero during training
  • Both methods can reduce overfitting and thus increase accuracy
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical

# Fetch a fresh copy of MNIST
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Append a channel axis and rescale the pixel values by 1/255
x_train = x_train[..., np.newaxis] / 255.0
x_test = x_test[..., np.newaxis] / 255.0

# One convolutional stage plus a dense head; the conv and hidden dense kernels
# carry an L2 penalty (0.01) and dropout follows the pooling and hidden layers
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu',
           kernel_regularizer=regularizers.l2(0.01), input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])

# Integer class labels are used directly, hence the sparse loss
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 5 epochs; the test split is passed as validation data so its
# loss/accuracy are reported after every epoch
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_test, y_test),
)

# Final score on the test split
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test accuracy: {test_acc}')

# Release the arrays; the next cell reloads the data
del x_train, x_test
  • Ensemble Learning: Train many CNNs and take the average of their predictions to be the result
    • Also used to reduce overfitting and capture various patterns in the data
  • The ensemble can be made of CNNs of the same architecture (this example) or different architectures
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.utils import to_categorical

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess the data: add a channel axis, rescale by 1/255, and one-hot
# encode the labels (required by the categorical cross-entropy loss below)
x_train = np.expand_dims(x_train, axis=-1) / 255.0
x_test = np.expand_dims(x_test, axis=-1) / 255.0
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

# Create multiple CNN models with identical architecture.
# The list is called `ensemble_models` rather than `models` so it does not
# shadow the `tensorflow.keras.models` module imported earlier in the file
# (shadowing it breaks any earlier cell that calls `models.Sequential`).
num_models = 5
ensemble_models = []

for i in range(num_models):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    ensemble_models.append(model)

# Train the models (each one is fit independently on the full training set)
for model in ensemble_models:
    model.fit(x_train, y_train, batch_size=128, epochs=5, verbose=0)

# Combine predictions from the models by averaging the class probabilities
y_preds = np.zeros_like(y_test)

for model in ensemble_models:
    y_preds += model.predict(x_test)

y_preds /= num_models

# Convert to the prediction (instead of the probabilities)
y_pred = np.argmax(y_preds, axis=-1)
y_test = np.argmax(y_test, axis=-1)

from sklearn.metrics import accuracy_score

print("Ensemble Accuracy:", accuracy_score(y_test, y_pred))

# Release the image arrays now that the ensemble has been scored
del x_train
del x_test
313/313 [==============================] - 1s 1ms/step
313/313 [==============================] - 0s 1ms/step
Ensemble Accuracy: 0.9811

More Complex Architecture

  • Inception Networks, ResNet, DeepNet
    • Improve computational efficiency with smaller parallel layers instead of larger layers
  • Can train the networks using Google’s ImageNet weights (transfer learning)
  • Not useful (in fact not applicable) for the MNIST data set but work well on larger image data sets