4.4 Recurrent Neural Networks: Processing sequences#
RNNs are commonly used for time-series forecasting. Regular dense networks can also handle this task, and CNNs can work well even for very long time series.
A recurrent neuron receives the current input together with the output it produced at the previous time step. Because each step feeds on the previous one, the network has memory; however, these simple cells have relatively short memory (on the order of 10 time steps). RNNs take in a sequence and output a sequence.
From D2L (Dive into Deep Learning): example of an RNN with a hidden state. At each time step, the hidden state is computed from the current input (multiplied by the input weights) and the previous hidden state (multiplied by the recurrent weights), and is passed on to the next time step: $h_t = \phi(x_t W_{xh} + h_{t-1} W_{hh} + b_h)$. A minimal sketch follows the imports below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
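To make the recurrence concrete, here is a minimal NumPy sketch of a single recurrent step; the weight names (`W_xh`, `W_hh`, `b_h`) and sizes are arbitrary illustrations, not library objects.
rng = np.random.default_rng(0)
input_size, hidden_size = 3, 5

# illustrative weights (in a real RNN these are learned)
W_xh = 0.1 * rng.normal(size=(input_size, hidden_size))   # input -> hidden
W_hh = 0.1 * rng.normal(size=(hidden_size, hidden_size))  # hidden -> hidden
b_h = np.zeros(hidden_size)

def rnn_step(x_t, h_prev):
    # h_t = tanh(x_t W_xh + h_{t-1} W_hh + b_h)
    return np.tanh(x_t @ W_xh + h_prev @ W_hh + b_h)

h = np.zeros(hidden_size)   # initial hidden state
for t in range(4):          # unroll over a short sequence
    x_t = rng.normal(size=input_size)
    h = rnn_step(x_t, h)    # the hidden state carries information forward
print(h.shape)              # (5,)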
In time series forecasting, the goal is to predict future values based on historical data. Two important concepts in this process are context data and prediction length.
Context data refers to the historical data the forecasting model uses to make predictions. It provides the background needed to capture the patterns, trends, and seasonality in the time series. The length of the context data is often called the look-back window or input window.
Prediction length refers to the number of future time steps the model aims to predict, also known as the forecast horizon. For example, if the forecast horizon is 5, the model predicts the next 5 time steps.
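As a small, arbitrary illustration of these two quantities:
series = np.arange(15)        # toy series: 0, 1, ..., 14
context_len, horizon = 10, 5  # look-back window and forecast horizon

context_window = series[:context_len]               # what the model sees
target = series[context_len:context_len + horizon]  # what it must predict
print(context_window)  # [0 1 2 3 4 5 6 7 8 9]
print(target)          # [10 11 12 13 14]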
Synthetic data#
Let’s create synthetic time series by superimposing two sine waves of different frequencies and adding noise.
def generate_time_series(batch_size, n_steps):
    # random frequencies and offsets, one per series
    f1, f2, off1, off2 = np.random.rand(4, batch_size, 1)
    t = np.linspace(0, 1, n_steps)
    y = 0.5 * np.sin((t - off1) * (f1 * 10 + 10))            # first wave
    y += 0.5 * np.sin((t - off2) * (f2 * 20 + 20))           # second wave
    y += 0.3 * (np.random.rand(batch_size, n_steps) - 0.5)   # noise
    return y.astype(np.float32)                              # shape: (batch_size, n_steps)
# we generate 10k time series of context+prediction_horizon points.
context=50
prediction_horizon=10
data = generate_time_series(10000,context+prediction_horizon)
# plot some of the time series in a 2x3 grid
plt.figure(figsize=(6, 4))
for i in range(6):
    plt.subplot(2, 3, i + 1)
    # plot context data
    plt.plot(range(context), data[i, :context], 'b-', label='context')
    # plot prediction horizon
    plt.plot(range(context, context + prediction_horizon), data[i, context:], 'r-', label='prediction')
    plt.grid(True)
plt.tight_layout()
plt.show()
Design a vanilla RNN#
The RNN takes the context window as input (one feature per time step) and outputs the `prediction_horizon` future values in one shot.
class VanillaRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(VanillaRNN, self).__init__()
        self.rnn = torch.nn.RNN(input_size, hidden_size, batch_first=True)  # RNN layer, batch is the first dimension
        self.fc = torch.nn.Linear(hidden_size, output_size)                 # fully connected output layer

    def forward(self, x):
        out, _ = self.rnn(x)           # out: (batch_size, seq_length, hidden_size)
        out = self.fc(out[:, -1, :])   # use the last hidden state -> (batch_size, output_size)
        return out
def create_rnn_and_data(time_series, context, prediction_horizon):
    input_size = 1                # one feature per time step (generate_time_series returns 2-D data)
    hidden_size = 20              # you can adjust this value
    output_size = prediction_horizon
    model = VanillaRNN(input_size, hidden_size, output_size)
    # prepare the input and target data
    x = time_series[:, :context, np.newaxis]   # add a feature dimension: (batch, context, 1)
    y = time_series[:, context:context + prediction_horizon]
    return model, x, y
model, x, y = create_rnn_and_data(data, context, prediction_horizon)
# Print model summary
print(model)
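Before training, it is worth sanity-checking the shapes with a dummy batch (an optional check on made-up zero inputs):
# dummy batch of 4 series, `context` time steps, 1 feature per step
with torch.no_grad():
    dummy = torch.zeros(4, context, 1)
    print(model(dummy).shape)  # expected: torch.Size([4, 10]) = (batch, prediction_horizon)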
Now prepare the training function
def train(model, x_train, y_train, x_val, y_val, n_epochs=100, lr=0.001):
    '''
    Function to train the model.

    Parameters:
    model: torch.nn.Module
        The model to train
    x_train: np.ndarray
        The input data for training (context)
    y_train: np.ndarray
        The target data for training (prediction horizon)
    x_val: np.ndarray
        The input data for validation (context)
    y_val: np.ndarray
        The target data for validation (prediction horizon)
    n_epochs: int
        The number of epochs
    lr: float
        The learning rate

    Returns:
    train_losses: list
        The training losses for each epoch
    val_losses: list
        The validation losses (recorded every 10 epochs)
    '''
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()
    train_losses = []
    val_losses = []
    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        y_pred = model(torch.from_numpy(x_train).float())
        # print(f"Shape of outputs: {y_pred.shape}")    # uncomment to debug shapes
        # print(f"Shape of y_train: {y_train.shape}")
        loss = loss_fn(y_pred, torch.from_numpy(y_train).float())
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                y_val_pred = model(torch.from_numpy(x_val).float())
                # print(f"Shape of val_outputs: {y_val_pred.shape}")  # uncomment to debug shapes
                # print(f"Shape of y_val: {y_val.shape}")
                val_loss = loss_fn(y_val_pred, torch.from_numpy(y_val).float())
                val_losses.append(val_loss.item())
            print(f'Epoch {epoch}, Loss {loss.item():.6f}, Val loss {val_loss.item():.6f}')
    return train_losses, val_losses
Train the VanillaRNN, first splitting the data into training and validation sets.
y.shape
prediction_horizon, context
data.shape
# split the data into training and validation sets
n_train = 7000
# add a trailing feature dimension to the inputs: (batch, time, 1)
x_train = data[:n_train, :context, np.newaxis]
y_train = data[:n_train, context:context + prediction_horizon]
x_val = data[n_train:, :context, np.newaxis]
y_val = data[n_train:, context:context + prediction_horizon]
# y_train and y_val are already 2-D (batch, prediction_horizon), no reshaping needed
x_train.shape, y_train.shape, x_val.shape, y_val.shape
train_losses, val_losses = train(model, x_train, y_train, x_val, y_val, n_epochs=100, lr=0.001)
# plot the loss curves (validation loss is recorded every 10 epochs)
plt.plot(train_losses, label='train')
plt.plot(np.arange(len(val_losses)) * 10, val_losses, label='val')
plt.legend()
plt.grid(True)
plt.yscale('log')
y_val.shape
# now show examples from the validation set with the ground truth and the prediction
model.eval()
with torch.no_grad():
    y_val_pred = model(torch.from_numpy(x_val).float()).numpy()
# plot some of the time series in a 2x3 grid
plt.figure(figsize=(6, 4))
for i in range(6):
    plt.subplot(2, 3, i + 1)
    # ground truth: context followed by the true future values
    plt.plot(np.concatenate([x_val[i, :, 0], y_val[i]]), label='GT')
    # prediction: context followed by the model's forecast
    plt.plot(np.concatenate([x_val[i, :, 0], y_val_pred[i]]), label='Prediction')
    plt.legend()
    plt.grid(True)
plt.tight_layout()
plt.show()
Forecast of several steps ahead: how far can you predict the future?#
We will now predict 10 steps ahead, training a model that outputs the next 10 values of the series at every time step (a sequence-to-sequence setup). The early part of the forecast is typically much better than the later part, since uncertainty grows with the horizon.
# we generate 10k time series of n_steps + 10 = 60 points
n_steps = 50
x = generate_time_series(10000, n_steps + 10)
# targets: at every time step, the next 10 values of the series
y = np.empty((10000, n_steps, 10))
for step_ahead in range(1, 10 + 1):
    y[:, :, step_ahead - 1] = x[:, step_ahead:step_ahead + n_steps]
# add a trailing feature dimension to the inputs: (batch, time, 1)
x_train = x[:7000, :n_steps, np.newaxis]
x_val = x[7000:9000, :n_steps, np.newaxis]
x_test = x[9000:, :n_steps, np.newaxis]
y_train = y[:7000]
y_val = y[7000:9000]
y_test = y[9000:]
from tensorflow import keras  # this part uses Keras (assuming the TensorFlow distribution)

model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, input_shape=[None, 1], return_sequences=True),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10))
])
model.summary()
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=20, batch_size=128)
y_pred=model.predict(x_test)
print(y_pred.shape)
print(x_test.shape)
print(y_test.shape)
plt.plot(np.arange(n_steps + 10), x[9000, :])             # full ground-truth series
plt.plot(np.arange(n_steps), x_test[0, :, 0])             # observed past (context)
plt.plot(np.arange(10) + n_steps, y_pred[0, -1, :], '+')  # forecast from the last time step
plt.legend(('Truth', 'past', 'future'))
plt.grid(True)
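To quantify how the error grows with the forecast horizon, we can compute the MSE separately for each of the 10 predicted steps, here using the forecasts made at the last observed time step:
# MSE per forecast step (1 to 10 steps ahead), averaged over the test set
per_step_mse = np.mean((y_pred[:, -1, :] - y_test[:, -1, :]) ** 2, axis=0)
plt.plot(np.arange(1, 11), per_step_mse, 'o-')
plt.xlabel('steps ahead')
plt.ylabel('MSE')
plt.grid(True)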
Problems with RNNs and solutions#
During training with backpropagation through time, the gradients of simple RNNs can become vanishingly small, so the weights effectively stop updating. This is called the vanishing gradient problem.
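A toy numerical illustration (scalar values chosen arbitrarily): during backpropagation through time the gradient is multiplied at every step by the recurrent weight and the activation derivative, so when that product is below 1 the gradient shrinks exponentially with the sequence length.
# toy illustration of the vanishing gradient: repeated multiplication by a factor < 1
w_rec = 0.9        # recurrent weight (scalar toy case)
tanh_deriv = 0.7   # a typical value of the tanh derivative, 1 - tanh(h)^2
grad = 1.0
for step in range(50):
    grad *= w_rec * tanh_deriv
print(f"gradient after 50 steps: {grad:.2e}")  # roughly 0.63**50, essentially zero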
To remedy this, the LSTM architecture introduces a memory cell and gates that control what to store, forget, and output, which helps avoid vanishing gradients.
LSTM#
Long Short-Term Memory (LSTM) cells are (somewhat complicated) units that aim to solve the short-memory problem.
An LSTM combines the hidden state from the previous time step, the internal (cell) state that acts as long-term memory, and the current input to produce the new hidden and internal states.
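One common formulation of the LSTM update (notation varies across references) is:

$$
\begin{aligned}
i_t &= \sigma(W_{xi} x_t + W_{hi} h_{t-1} + b_i) && \text{(input gate)}\\
f_t &= \sigma(W_{xf} x_t + W_{hf} h_{t-1} + b_f) && \text{(forget gate)}\\
o_t &= \sigma(W_{xo} x_t + W_{ho} h_{t-1} + b_o) && \text{(output gate)}\\
\tilde{c}_t &= \tanh(W_{xc} x_t + W_{hc} h_{t-1} + b_c) && \text{(candidate memory)}\\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t && \text{(internal/cell state)}\\
h_t &= o_t \odot \tanh(c_t) && \text{(hidden state)}
\end{aligned}
$$

The forget gate $f_t$ lets the cell carry information across many time steps, which is what mitigates the vanishing-gradient problem.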
model = keras.models.Sequential([
    keras.layers.LSTM(20, input_shape=[None, 1], return_sequences=True),
    keras.layers.LSTM(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10))
])
model.summary()
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=20, batch_size=128)
y_pred=model.predict(x_test)
plt.plot(np.arange(n_steps + 10), x[9000, :])             # full ground-truth series
plt.plot(np.arange(n_steps), x_test[0, :, 0])             # observed past (context)
plt.plot(np.arange(10) + n_steps, y_pred[0, -1, :], '+')  # forecast from the last time step
plt.legend(('Truth', 'past', 'future'))
plt.grid(True)