Advanced Methods for Improving the Performance of Convolutional Neural Networks

CSC/DSC 340 Week 11 Slides

Date Created: October 29, 2023

Last Modified: October 29, 2023

Last week we learned how to classify the MNIST data set with convolutional neural networks and how to improve the performance of these CNNs with hyperparameter tuning
This week we will learn how to further improve the performance using a few preprocessing techniques and a few different architecture styles
First, let’s review the MNIST data set

import tensorflow as tf

# Fetch MNIST through the Keras dataset helper
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Scale both image arrays by 1/255
train_images = train_images / 255.0
test_images = test_images / 255.0

# Report the dimensions of each array (only the shape tuples are looped over,
# so no extra reference to the image data is left behind)
for caption, shape in (
    ("Train images shape:", train_images.shape),
    ("Train labels shape:", train_labels.shape),
    ("Test images shape:", test_images.shape),
    ("Test labels shape:", test_labels.shape),
):
    print(caption, shape)

Train images shape: (60000, 28, 28)
Train labels shape: (60000,)
Test images shape: (10000, 28, 28)
Test labels shape: (10000,)

import matplotlib.pyplot as plt
# Preview the first few training digits in one row, each captioned with its label
num_images = 5
plt.figure(figsize=(10, 3))
for column in range(1, num_images + 1):
    sample = column - 1  # subplot positions are 1-based, array indices 0-based
    plt.subplot(1, num_images, column)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[sample], cmap=plt.cm.binary)
    plt.xlabel(train_labels[sample])
plt.show()

Now let’s review how to classify the MNIST data set with a simple CNN

from tensorflow.keras import layers, models

# Assemble the baseline CNN: three convolution layers (the first two each
# followed by 2x2 max-pooling), then a dense classifier with 10 outputs
cnn_layers = [
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
]
model = models.Sequential(cnn_layers)

# Configure the optimizer, loss, and reported metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 5 epochs, holding out 10% of the training data for validation
model.fit(
    train_images,
    train_labels,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
)

# Score the trained model on the test split
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)

# Drop the names of the image arrays now that this cell is done with them
del train_images, test_images

Epoch 1/5
844/844 [==============================] - 17s 20ms/step - loss: 0.2010 - accuracy: 0.9379 - val_loss: 0.0659 - val_accuracy: 0.9803
Epoch 2/5
844/844 [==============================] - 21s 25ms/step - loss: 0.0547 - accuracy: 0.9829 - val_loss: 0.0395 - val_accuracy: 0.9885
Epoch 3/5
844/844 [==============================] - 18s 21ms/step - loss: 0.0379 - accuracy: 0.9881 - val_loss: 0.0373 - val_accuracy: 0.9898
Epoch 4/5
844/844 [==============================] - 18s 22ms/step - loss: 0.0291 - accuracy: 0.9907 - val_loss: 0.0352 - val_accuracy: 0.9908
Epoch 5/5
844/844 [==============================] - 21s 25ms/step - loss: 0.0233 - accuracy: 0.9922 - val_loss: 0.0379 - val_accuracy: 0.9893
313/313 - 1s - loss: 0.0373 - accuracy: 0.9868 - 1s/epoch - 4ms/step

Test accuracy: 0.9868000149726868

Now let’s attempt to improve this accuracy
- NOTE: these techniques may not work well on the MNIST data set since the images are quite simple but will work better with larger and more complex images

Improving Through Pre-Processing Steps

First let’s increase the size of the data set using data augmentation
- Add images to the data set that are rotated, stretched, translated, and zoomed in versions of the original images
- Should lead to better generalization and thus increased accuracy
Note that for the ImageDataGenerator function the X data (the images) need to have four dimensions (samples, height, width, channels) instead of three

from tensorflow.keras.preprocessing.image import ImageDataGenerator
import scipy.ndimage

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing and normalization: reshape to (samples, 28, 28, 1) so the
# images carry an explicit channel axis, convert to float32, and scale by 1/255
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1).astype('float32') / 255
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1).astype('float32') / 255

# Data augmentation: random rotations, horizontal/vertical shifts, and zooms
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
)
# NOTE: datagen.fit(x_train) is intentionally not called. It only computes
# statistics for featurewise_center, featurewise_std_normalization, or
# zca_whitening, and none of those options are enabled here.

# Create a simple CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model with augmented data.
# steps_per_epoch must be a whole number of batches; len() of the iterator
# gives the number of batches per pass over the data, whereas the previous
# len(x_train) / 32 produced a float.
batch_size = 32
train_flow = datagen.flow(x_train, y_train, batch_size=batch_size)
model.fit(train_flow, steps_per_epoch=len(train_flow), epochs=5)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test accuracy: {test_acc}')

# The iterator keeps a reference to x_train, so drop it along with the arrays
del train_flow
del x_train
del x_test

Epoch 1/5
1875/1875 [==============================] - 29s 15ms/step - loss: 0.2803 - accuracy: 0.9118
Epoch 2/5
1207/1875 [==================>...........] - ETA: 9s - loss: 0.1032 - accuracy: 0.9681

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In [4], line 38
     33 model.compile(optimizer='adam',
     34               loss='sparse_categorical_crossentropy',
     35               metrics=['accuracy'])
     37 # Fit the model with augmented data
---> 38 model.fit(datagen.flow(x_train, y_train, batch_size=32),
     39           steps_per_epoch=len(x_train) / 32, epochs=5)
     41 # Evaluate the model
     42 test_loss, test_acc = model.evaluate(x_test, y_test)

File ~/Library/Python/3.9/lib/python/site-packages/keras/src/utils/traceback_utils.py:65, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     63 filtered_tb = None
     64 try:
---> 65     return fn(*args, **kwargs)
     66 except Exception as e:
     67     filtered_tb = _process_traceback_frames(e.__traceback__)

File ~/Library/Python/3.9/lib/python/site-packages/keras/src/engine/training.py:1783, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1775 with tf.profiler.experimental.Trace(
   1776     "train",
   1777     epoch_num=epoch,
   (...)
   1780     _r=1,
   1781 ):
   1782     callbacks.on_train_batch_begin(step)
-> 1783     tmp_logs = self.train_function(iterator)
   1784     if data_handler.should_sync:
   1785         context.async_wait()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/util/traceback_utils.py:150, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    148 filtered_tb = None
    149 try:
--> 150   return fn(*args, **kwargs)
    151 except Exception as e:
    152   filtered_tb = _process_traceback_frames(e.__traceback__)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:831, in Function.__call__(self, *args, **kwds)
    828 compiler = "xla" if self._jit_compile else "nonXla"
    830 with OptionalXlaContext(self._jit_compile):
--> 831   result = self._call(*args, **kwds)
    833 new_tracing_count = self.experimental_get_tracing_count()
    834 without_tracing = (tracing_count == new_tracing_count)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:867, in Function._call(self, *args, **kwds)
    864   self._lock.release()
    865   # In this case we have created variables on the first call, so we run the
    866   # defunned version which is guaranteed to never create variables.
--> 867   return tracing_compilation.call_function(
    868       args, kwds, self._no_variable_creation_config
    869   )
    870 elif self._variable_creation_config is not None:
    871   # Release the lock early so that multiple threads can perform the call
    872   # in parallel.
    873   self._lock.release()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/tracing_compilation.py:139, in call_function(args, kwargs, tracing_options)
    137 bound_args = function.function_type.bind(*args, **kwargs)
    138 flat_inputs = function.function_type.unpack_inputs(bound_args)
--> 139 return function._call_flat(  # pylint: disable=protected-access
    140     flat_inputs, captured_inputs=function.captured_inputs
    141 )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/concrete_function.py:1264, in ConcreteFunction._call_flat(self, tensor_inputs, captured_inputs)
   1260 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
   1261 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
   1262     and executing_eagerly):
   1263   # No tape is watching; skip to running the function.
-> 1264   return self._inference_function.flat_call(args)
   1265 forward_backward = self._select_forward_and_backward_functions(
   1266     args,
   1267     possible_gradient_type,
   1268     executing_eagerly)
   1269 forward_function, args_with_tangents = forward_backward.forward()

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py:217, in AtomicFunction.flat_call(self, args)
    215 def flat_call(self, args: Sequence[core.Tensor]) -> Any:
    216   """Calls with tensor inputs and returns the structured output."""
--> 217   flat_outputs = self(*args)
    218   return self.function_type.pack_output(flat_outputs)

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py:252, in AtomicFunction.__call__(self, *args)
    250 with record.stop_recording():
    251   if self._bound_context.executing_eagerly():
--> 252     outputs = self._bound_context.call_function(
    253         self.name,
    254         list(args),
    255         len(self.function_type.flat_outputs),
    256     )
    257   else:
    258     outputs = make_call_op_in_graph(
    259         self,
    260         list(args),
    261         self._bound_context.function_call_options.as_attrs(),
    262     )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/context.py:1479, in Context.call_function(self, name, tensor_inputs, num_outputs)
   1477 cancellation_context = cancellation.context()
   1478 if cancellation_context is None:
-> 1479   outputs = execute.execute(
   1480       name.decode("utf-8"),
   1481       num_outputs=num_outputs,
   1482       inputs=tensor_inputs,
   1483       attrs=attrs,
   1484       ctx=self,
   1485   )
   1486 else:
   1487   outputs = execute.execute_with_cancellation(
   1488       name.decode("utf-8"),
   1489       num_outputs=num_outputs,
   (...)
   1493       cancellation_manager=cancellation_context,
   1494   )

File ~/Library/Python/3.9/lib/python/site-packages/tensorflow/python/eager/execute.py:60, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     53   # Convert any objects of type core_types.Tensor to Tensor.
     54   inputs = [
     55       tensor_conversion_registry.convert(t)
     56       if isinstance(t, core_types.Tensor)
     57       else t
     58       for t in inputs
     59   ]
---> 60   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     61                                       inputs, attrs, num_outputs)
     62 except core._NotOkStatusException as e:
     63   if name is not None:

KeyboardInterrupt:

Next we will attempt to increase the accuracy by sharpening the edges of the digits by removing pixels that are light grey
For more complex images, there are algorithms for edge sharpening, deblurring, etc. that can be applied in a similar manner

import numpy as np
# Load the MNIST dataset as (images, labels) pairs for the training and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Function to zero out gray pixels (the name is kept for backward compatibility)
def remove_gray_images(images, threshold=5):
    """Return a copy of *images* with every pixel below ``255 - threshold`` set to 0.

    Despite the name, no image is removed: each image keeps its shape and only
    the individual pixels dimmer than the cutoff are zeroed.

    Args:
        images: Array-like of pixel intensities. Any shape is accepted; the
            comparison is applied element-wise.
        threshold: Distance below 255 that a pixel may be and still be kept.
            Pixels with value ``< 255 - threshold`` become 0.

    Returns:
        A new ``numpy.ndarray`` with the same shape and dtype as
        ``np.array(images)``. The input is left unmodified.
    """
    # Work on a copy so the caller's data is not altered as a side effect.
    cleaned = np.array(images, copy=True)
    # A single vectorized mask replaces the per-pixel Python loop.
    cleaned[cleaned < 255 - threshold] = 0
    return cleaned

# Apply the gray-pixel filter to both splits with the same threshold
# (threshold 55 means pixels below 255 - 55 = 200 are set to 0)
GRAY_THRESHOLD = 55
x_train_filtered = remove_gray_images(x_train, threshold=GRAY_THRESHOLD)
x_test_filtered = remove_gray_images(x_test, threshold=GRAY_THRESHOLD)

The original images

# Reload MNIST before plotting the original digits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Show the first few training digits in one row, each captioned with its label
num_images = 5
plt.figure(figsize=(10, 3))
for column in range(1, num_images + 1):
    sample = column - 1  # subplot positions are 1-based, array indices 0-based
    plt.subplot(1, num_images, column)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(x_train[sample], cmap=plt.cm.binary)
    plt.xlabel(y_train[sample])
plt.show()

The sharpened images

# Show the first few filtered training digits in one row, captioned with their labels
num_images = 5
plt.figure(figsize=(10, 3))
for column in range(1, num_images + 1):
    sample = column - 1  # subplot positions are 1-based, array indices 0-based
    plt.subplot(1, num_images, column)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(x_train_filtered[sample], cmap=plt.cm.binary)
    plt.xlabel(y_train[sample])
plt.show()

Now let’s train a CNN using the sharpened data set

# Scale the filtered images by 1/255
x_train_filtered = x_train_filtered / 255
x_test_filtered = x_test_filtered / 255

# Build the model (same architecture as the baseline CNN)
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model, holding out 10% of the training data for validation
model.fit(x_train_filtered, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the model.
# Use y_test, the labels loaded alongside x_test in this section, rather than
# test_labels left over from the first cell, so this cell does not depend on
# a variable from an unrelated cell still being defined.
test_loss, test_acc = model.evaluate(x_test_filtered, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

del x_train
del x_test
del x_train_filtered
del x_test_filtered

Improving Accuracy with Differing Architectures

First we will test the addition of regularization and drop-out to the CNN
- Regularization: Using L1 or L2 norms to control the values of the weights of the network (like LASSO and Ridge regression)
- Dropout: randomly setting X% of the outputs (activations) of a layer to zero during training
Both methods can reduce overfitting and thus increase accuracy

import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess the data: add a trailing channel axis and scale by 1/255
x_train = np.expand_dims(x_train, axis=-1) / 255.0
x_test = np.expand_dims(x_test, axis=-1) / 255.0

# Create a CNN model with regularization and dropout:
# L2 weight penalties on the convolution and hidden dense layers, and dropout
# after the pooling layer (25%) and after the hidden dense layer (50%)
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.01), input_shape=(28, 28, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model.
# Validation uses a 10% hold-out of the training data (as in the earlier
# cells) instead of the test set, so the test set is only touched by the
# final evaluation below.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=5,
          verbose=1,
          validation_split=0.1)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test accuracy: {test_acc}')

del x_train
del x_test

Ensemble Learning: Train many CNNs and take the average of their predictions to be the result
- Also used to reduce overfitting and capture various patterns in the data
The ensemble can be made of CNNs of the same architecture (this example) or different architectures

import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.utils import to_categorical

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess the data: add a trailing channel axis, scale by 1/255, and
# one-hot encode the labels to match the categorical_crossentropy loss
x_train = np.expand_dims(x_train, axis=-1) / 255.0
x_test = np.expand_dims(x_test, axis=-1) / 255.0
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

# Create multiple CNN models.
# The list is named `ensemble` rather than `models` so that it does not
# overwrite the `models` module imported from tensorflow.keras, which the
# earlier cells use as models.Sequential(...).
num_models = 5
ensemble = []

for _ in range(num_models):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    ensemble.append(model)

# Train the models (each one sees the same data)
for member in ensemble:
    member.fit(x_train, y_train, batch_size=128, epochs=5, verbose=0)

# Combine predictions from the models: average the per-class probabilities
y_preds = np.mean([member.predict(x_test) for member in ensemble], axis=0)

# Convert to the prediction (instead of the probabilities)
y_pred = np.argmax(y_preds, axis=-1)
y_test = np.argmax(y_test, axis=-1)

from sklearn.metrics import accuracy_score

print("Ensemble Accuracy:", accuracy_score(y_test, y_pred))

del x_train
del x_test

313/313 [==============================] - 1s 1ms/step
313/313 [==============================] - 0s 1ms/step
Ensemble Accuracy: 0.9811

More Complex Architecture

Inception Networks, ResNet, DeepNet
- Improve computational efficiency with smaller parallel layers instead of larger layers
Can initialize the networks with weights pre-trained on the ImageNet data set (transfer learning)
Not useful (in fact not applicable) for the MNIST data set but work well on larger image data sets