ARMA Models & Hyperparameter Tuning
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import inspect
import time
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
warnings.filterwarnings("ignore")
GOAL
Machine Learning Workflow
# Create a client to connect to the MongoDB server that assigns the "air-quality" database to db & the "nairobi" connection to nairobi
client = MongoClient(host="localhost", port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]
# Change my wrangle function so that it has a resample_rule argument that allows the user to change the resampling interval
def wrangle(collection, resample_rule="1H"):
results = collection.find(
{"metadata.site": 29, "metadata.measurement": "P2"},
projection={"P2": 1, "timestamp": 1, "_id": 0},
)
# Read results into DataFrame
df = pd.DataFrame(list(results)).set_index("timestamp")
# Localize timezone
df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")
# Remove outliers
df = df[df["P2"] < 500]
# Resample and forward-fill
y = df["P2"].resample("1H").mean().fillna(method="ffill") # The argument default should be "1H"
return y
# Use my wrangle function to read the data from the nairobi collection into the Series y
y = wrangle(nairobi)
y.head()
timestamp 2018-09-01 03:00:00+03:00 17.541667 2018-09-01 04:00:00+03:00 15.800000 2018-09-01 05:00:00+03:00 11.420000 2018-09-01 06:00:00+03:00 11.614167 2018-09-01 07:00:00+03:00 17.665000 Freq: H, Name: P2, dtype: float64
# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax);
plt.xlabel("Lag [hours]") # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient"); # Label the y-axis as "Correlation Coefficient"
# Create an PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax);
plt.xlabel("Lag [hours]") # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient"); # Label the y-axis as "Correlation Coefficient"
# Create a training set y_train that contains only readings from October 2018
y_train = y.loc["2018-10-01":"2018-10-31"]
y_test = y.loc["2018-11-01"] # Create a test set y_test that contains readings from November 1, 2018
# Calculate the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 10.12 Baseline MAE: 4.17
# Create ranges for possible 𝑝 and 𝑞 values
p_params = range(0, 25, 8) # p_params should range between 0 and 25, by steps of 8
q_params = range(0, 3, 1) # q_params should range between 0 and 3 by steps of 1
list(p_params)
[0, 8, 16, 24]
list(q_params)
[0, 1, 2]
# Prepare to train a model with every combination of hyperparameters in p_params and q_params
for p in p_params:
print(p)
0 8 16 24
for p in p_params:
for q in q_params:
order = (p, 0, q)
print(order)
(0, 0, 0) (0, 0, 1) (0, 0, 2) (8, 0, 0) (8, 0, 1) (8, 0, 2) (16, 0, 0) (16, 0, 1) (16, 0, 2) (24, 0, 0) (24, 0, 1) (24, 0, 2)
for p in p_params:
for q in q_params:
order = (p, 0, q)
start_time = time.time()
model = ARIMA(y_train, order=order).fit()
elapsed_time = round(time.time() - start_time, 2)
print(f"Trained ARIMA model {order} in {elapsed_time} seconds.")
Trained ARIMA model (0, 0, 0) in 0.31 seconds. Trained ARIMA model (0, 0, 1) in 0.38 seconds. Trained ARIMA model (0, 0, 2) in 1.69 seconds. Trained ARIMA model (8, 0, 0) in 10.4 seconds. Trained ARIMA model (8, 0, 1) in 37.7 seconds. Trained ARIMA model (8, 0, 2) in 68.3 seconds. Trained ARIMA model (16, 0, 0) in 36.11 seconds. Trained ARIMA model (16, 0, 1) in 128.7 seconds. Trained ARIMA model (16, 0, 2) in 190.1 seconds. Trained ARIMA model (24, 0, 0) in 99.32 seconds.
time.time()
1656013950.9323714
# Create dictionary to store MAEs
mae_grid = dict()
# Outer loop: Iterate through possible values for `p`
for p in p_params:
# Create key-value pair in dict. Key is `p`, value is empty list.
mae_grid[p] = list()
# Inner loop: Iterate through possible values for `q`
for q in q_params:
# Combination of hyperparameters for model
order = (p, 0, q)
# Note start time
start_time = time.time()
# Train model
model = ARIMA(y_train, order=order).fit()
# Calculate model training time
elapsed_time = round(time.time() - start_time, 2)
print(f"Trained ARIMA {order} in {elapsed_time} seconds.")
# Generate in-sample (training) predictions
y_pred = model.predict()
# Calculate training MAE
mae = mean_absolute_error(y_train, y_pred) # Every time the model is trained, the mean absolute error is calculated and then saved to a dictionary
# Append MAE to list in dictionary
mae_grid[p].append(mae)
print()
print(mae_grid)
Trained ARIMA (0, 0, 0) in 0.29 seconds. Trained ARIMA (0, 0, 1) in 0.38 seconds. Trained ARIMA (0, 0, 2) in 1.19 seconds. Trained ARIMA (8, 0, 0) in 9.4 seconds. Trained ARIMA (8, 0, 1) in 37.39 seconds. Trained ARIMA (8, 0, 2) in 69.8 seconds. Trained ARIMA (16, 0, 0) in 38.31 seconds. Trained ARIMA (16, 0, 1) in 125.51 seconds. Trained ARIMA (16, 0, 2) in 194.2 seconds. Trained ARIMA (24, 0, 0) in 97.89 seconds. Trained ARIMA (24, 0, 1) in 131.2 seconds. Trained ARIMA (24, 0, 2) in 251.3 seconds. {0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647], 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133], 16: [2.9201084726122, 2.929436207635203, 2.913638894139686], 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}
mae_grid
{0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647], 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133], 16: [2.9201084726122, 2.929436207635203, 2.913638894139686], 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}
# Organize all the MAE's from above in a DataFrame names mae_df
mae_df = pd.DataFrame(mae_grid)
mae_df.round(4)
0 | 8 | 16 | 24 | |
---|---|---|---|---|
0 | 4.1715 | 2.9384 | 2.9201 | 2.9144 |
1 | 3.3506 | 2.9149 | 2.9294 | 2.9136 |
2 | 3.1057 | 2.9080 | 2.9136 | 2.8979 |
# Create heatmap of the values in mae_grid
sns.heatmap(mae_df, cmap="Blues")
plt.xlabel("p values") # Label x-axis "p values"
plt.ylabel("q values") # Label y-axis "q values"
plt.title("ARMA Grid Search (Criterion: MAE)");
# Use the plot_diagnostics method to check the residuals for my model
fig, ax = plt.subplots(figsize=(15, 12))
model.plot_diagnostics(fig=fig); # Plot will represent the residuals from the last model I trained
# Perform walk-forward validation for my model for the entire test set y_test
y_pred_wfv = pd.Series()
history = y_train.copy()
for i in range(len(y_test)):
model = ARIMA(history, order=(8,0,1)).fit()
next_pred = model.forecast()
y_pred_wfv = y_pred_wfv.append(next_pred) # Store my model's predictions in the Series y_pred_wfv
history = history.append(y_test[next_pred.index])
test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))
Test MAE (walk forward validation): 1.67
# Generate the list of training predictions for my model
df_predictions = pd.DataFrame({"y_test": y_test, "y_pred_wfv": y_pred_wfv})
fig = px.line(df_predictions, labels={"value": "PM2.5"})
fig.show()