Recurrent Neural Networks#
CSC/DSC 340 Week 12 Slides
Author: Dr. Julie Butler
Date Created: November 4, 2023
Last Modified: November 5, 2023
Previously#
We have been using regular neural networks to perform interpolation
import tensorflow as tf
import numpy as np
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import TimeseriesGenerator
# Generate training data
x_train = np.random.uniform(-3*np.pi, 3*np.pi, 1000) # Generate 1000 random numbers between -3π and 3π
y_train = np.sin(x_train) # Get corresponding function values
# Create the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(x_train, y_train, epochs=500, verbose=0)
# Test the model
x_test = np.random.uniform(-3*np.pi, 3*np.pi, 100)
y_test = np.sin(x_test)
y_pred = model.predict(x_test)
print(mse(y_test, y_pred))
plt.scatter(x_train, y_train)
plt.scatter(x_test, y_pred)
0.0007015685019484169
[Plot: training data and the network's predictions at the test points]
While regular neural networks perform well at interpolation, they tend to perform poorly when asked to extrapolate (as is true of most machine learning algorithms)
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Create the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=500, verbose=0)
y_pred = model.predict(X_test)
print(mse(y_test, y_pred))
plt.plot(X_train, y_train)
plt.plot(X_test, y_test)
plt.plot(X_test, y_pred, color='green')
1.154618569497259
[Plot: training data, true test values, and the network's extrapolated predictions (green)]
Many extrapolation cases in machine learning use recurrent neural networks (RNNs)
While traditional neural networks are feedforward (data only passes from the input layer to the output layer), recurrent neural networks have a memory that feeds information backwards; see the recurrence relation below
Note that the input data for an RNN, for both training and testing, needs to be three dimensional: (samples, time steps, features)
SimpleRNN has the same arguments as Dense, where the number is the number of neurons and we can set the activation function. Note that you still need at least one Dense layer at the end of the network to “post-process” the results
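As a sketch of where this memory comes from (using common textbook notation, not anything defined in these slides), a SimpleRNN layer keeps a hidden state \(h_t\) that is updated at each time step from the current input \(x_t\) and the previous hidden state:
\(h_t = \tanh(W x_t + U h_{t-1} + b)\)
Here \(\tanh\) is the default activation (the examples below use relu instead), and the dependence on \(h_{t-1}\) is the memory that a feedforward Dense layer lacks.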
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Preprocess the data for the RNN
X_train = X_train.reshape(-1, 1, 1) # Reshape the input data for RNN
# Create the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(32, activation='relu', input_shape=(1, 1)),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)
X_test = X_test.reshape(-1, 1, 1)
y_pred = model.predict(X_test)
print(mse(y_test.flatten(), y_pred))
plt.plot(X_train.flatten(), y_train)
plt.plot(X_test.flatten(), y_test)
plt.plot(X_test.flatten(), y_pred, color='green')
0.5288347596842933
[Plot: training data, true test values, and the RNN's extrapolated predictions (green)]
This network did not perform well at extrapolation, so let’s try some hyperparameter tuning
Try adding more SimpleRNN layers, which will result in more recurrent neurons (more memory)
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Preprocess the data for the RNN
X_train = X_train.reshape(-1, 1, 1) # Reshape the input data for RNN
# Create the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(32, activation='relu', return_sequences=True, input_shape=(1, 1)),
    tf.keras.layers.SimpleRNN(32, activation='relu', return_sequences=True),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)
X_test = X_test.reshape(-1, 1, 1)
y_pred = model.predict(X_test)
print(mse(y_test, y_pred.flatten()))
plt.plot(X_train.flatten(), y_train)
plt.plot(X_test.flatten(), y_test)
plt.plot(X_test.flatten(), y_pred.flatten(), color='green')
0.5173149708723822
[Plot: training data, true test values, and the two-layer RNN's extrapolated predictions (green)]
Some applications of RNNs have shown that, in addition to using Dense layers to post-process the results of RNN layers, Dense layers can also be used to pre-process the data before it reaches the RNN layers
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Preprocess the data for the RNN
X_train = X_train.reshape(-1, 1, 1) # Reshape the input data for RNN
# Create the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(1, 1)),
    tf.keras.layers.SimpleRNN(32, activation='relu', return_sequences=True),
    tf.keras.layers.SimpleRNN(32, activation='relu', return_sequences=True),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)
X_test = X_test.reshape(-1, 1, 1)
y_pred = model.predict(X_test)
print(mse(y_test, y_pred.flatten()))
plt.plot(X_train.flatten(), y_train)
plt.plot(X_test.flatten(), y_test)
plt.plot(X_test.flatten(), y_pred.flatten(), color='green')
0.5700624385641841
[Plot: training data, true test values, and the Dense + RNN network's extrapolated predictions (green)]
SimpleRNN layers are not working well on this task, so let’s try another type of RNN layer
LSTM (long short-term memory) layers are improved RNN layers
An LSTM maintains long-term dependencies in the data by replacing a simple neuron with a memory cell that can store information over time
The memory is maintained through a series of gates (see the equations below)
LSTMs are well suited for applications with ordered data, such as natural language processing
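For reference, a sketch of the standard LSTM cell using common textbook notation (not defined in these slides): a forget gate \(f_t\), an input gate \(i_t\), and an output gate \(o_t\) control what is written to and read from the cell state \(c_t\)
Forget gate: \(f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f)\)
Input gate: \(i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i)\)
Output gate: \(o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o)\)
Cell state: \(c_t = f_t \odot c_{t-1} + i_t \odot \tanh(W_c x_t + U_c h_{t-1} + b_c)\)
Hidden state: \(h_t = o_t \odot \tanh(c_t)\)
Because the cell state is updated additively, gradients can flow across many time steps without vanishing as quickly as in a simple RNN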
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Preprocess the data for the RNN
X_train = X_train.reshape(-1, 1, 1) # Reshape the input data for RNN
# Create the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True, input_shape=(1, 1)),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)
X_test = X_test.reshape(-1, 1, 1)
y_pred = model.predict(X_test)
print(mse(y_test, y_pred.flatten()))
plt.plot(X_train.flatten(), y_train)
plt.plot(X_test.flatten(), y_test)
plt.plot(X_test.flatten(), y_pred.flatten(), color='green')
0.5052410511806004
[Plot: training data, true test values, and the LSTM network's extrapolated predictions (green)]
Another improved RNN layer is called the GRU (gated recurrent unit)
GRUs perform similarly to LSTM layers and are used in many of the same applications; like LSTMs, they are far less prone to the vanishing gradient problem than simple RNN layers, and they have a simpler structure with fewer parameters (see the equations below)
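For comparison, a sketch of the standard GRU cell in the same textbook notation: an update gate \(z_t\) and a reset gate \(r_t\) control how much of the hidden state is overwritten, with no separate cell state
Update gate: \(z_t = \sigma(W_z x_t + U_z h_{t-1} + b_z)\)
Reset gate: \(r_t = \sigma(W_r x_t + U_r h_{t-1} + b_r)\)
Candidate state: \(\tilde{h}_t = \tanh(W_h x_t + U_h (r_t \odot h_{t-1}) + b_h)\)
Hidden state: \(h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t\)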
X_train = np.arange(0,100,0.5)
y_train = np.sin(X_train)
X_test = np.arange(90,200,0.5)
y_test = np.sin(X_test)
# Preprocess the data for the RNN
X_train = X_train.reshape(-1, 1, 1) # Reshape the input data for RNN
# Create the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.GRU(32, activation='relu', return_sequences=True, input_shape=(1, 1)),
    tf.keras.layers.GRU(32, activation='relu', return_sequences=True),
    tf.keras.layers.Dense(1)
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)
X_test = X_test.reshape(-1, 1, 1)
y_pred = model.predict(X_test)
print(mse(y_test, y_pred.flatten()))
plt.plot(X_train.flatten(), y_train)
plt.plot(X_test.flatten(), y_test)
plt.plot(X_test.flatten(), y_pred.flatten(), color='green')
0.5355605067324839
[Plot: training data, true test values, and the GRU network's extrapolated predictions (green)]
All of these RNNs are performing poorly on what should be an easy-to-recognize long-term pattern
Perhaps the problem is not the RNNs but how we are trying to train them.
Traditional Training: \(f(x_i) = y_i\)
Time Series Training: \(f(y_{i-3}, y_{i-2}, y_{i-1}) = y_i\)
Limitation: the data must be evenly spaced in some input variable (which now becomes a hidden variable); see the sliding-window sketch below
Let’s try reformatting our data as a time series and training a very simple RNN
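Before the full example, here is a minimal sliding-window sketch (a hypothetical example, not taken from these slides' code) of what the time series formatting produces, using a window length of 3 to match the formula above:
# Minimal sliding-window sketch (hypothetical example): pair each window of
# three past values with the value that immediately follows it
demo_series = np.sin(np.arange(0, 10, 0.5))  # an evenly spaced series (20 points)
window = 3
X_windows = np.array([demo_series[i:i+window] for i in range(len(demo_series) - window)])
y_targets = demo_series[window:]
print(X_windows.shape, y_targets.shape)  # (17, 3) (17,)
TimeseriesGenerator (imported at the top of these slides) builds the same kind of windowed input/target pairs for us, in batches.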
n_features = 1
train_series = y_train.reshape((len(y_train), n_features))
test_series = y_test.reshape((len(y_test), n_features))
seq = 20
train_generator = TimeseriesGenerator(train_series, train_series,
                                      length=seq,
                                      sampling_rate=1,
                                      stride=1,
                                      batch_size=10)
test_generator = TimeseriesGenerator(test_series, test_series,
                                     length=seq,
                                     sampling_rate=1,
                                     stride=1,
                                     batch_size=10)
n_neurons = 4
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(n_neurons, input_shape=(seq, n_features)))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(train_generator,epochs=300, verbose=0)
y_pred = model.predict(test_generator)
print(mse(y_test[20:], y_pred.flatten()))
plt.plot(X_train.flatten(),y_train)
plt.plot(X_test.flatten(),y_test)
plt.plot(X_test[20:].flatten(),y_pred)
3.928643762298923e-06
[Plot: training data, true test values, and the time series RNN's predictions]
Even an exceedingly small and simple RNN was able to capture the patterns in the data once it was formatted as a time series.
This is how all RNNs are trained on sequential data, even for text/natural language processing (more on this next week!)