ARMA Models & Hyperparameter Tuning

In [38]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In [15]:
import inspect
import time
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

warnings.filterwarnings("ignore")

GOAL

  • Create an autoregressive moving average (ARMA)model to predict PM2.5 levels.
  • Tune my model hyperparameters to improve performance.

Machine Learning Workflow

  • Prepare Data
    • Import: Add an argument to a function
    • Explore
    • Split: Small training and test sets
  • Build Model
    • Baseline
    • Iterate: ARMA model, hyperparameter tuning (Hyperparameter is a part of model that's set before training occurs and it controls the learning process).
    • Evaluate
  • Communicate Results
    • Time series plot with plotly express.

Prepare Data¶

Import¶

In [16]:
# Create a client to connect to the MongoDB server that assigns the "air-quality" database to db & the "nairobi" connection to nairobi
client = MongoClient(host="localhost", port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]
In [17]:
# Change my wrangle function so that it has a resample_rule argument that allows the user to change the resampling interval

def wrangle(collection, resample_rule="1H"):

    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500]

    # Resample and forward-fill
    y = df["P2"].resample("1H").mean().fillna(method="ffill")           # The argument default should be "1H"
    
    return y
In [18]:
# Use my wrangle function to read the data from the nairobi collection into the Series y
y = wrangle(nairobi)
y.head()
Out[18]:
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64

Explore¶

In [19]:
# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax);
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"
In [20]:
# Create an PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax); 
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"

Split¶

In [21]:
# Create a training set y_train that contains only readings from October 2018
y_train = y.loc["2018-10-01":"2018-10-31"]
y_test = y.loc["2018-11-01"]               # Create a test set y_test that contains readings from November 1, 2018

Build Model¶

Baseline¶

In [22]:
# Calculate the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 10.12
Baseline MAE: 4.17

Iterate¶

In [23]:
# Create ranges for possible  𝑝  and  𝑞  values
p_params = range(0, 25, 8)  # p_params should range between 0 and 25, by steps of 8
q_params = range(0, 3, 1)   # q_params should range between 0 and 3 by steps of 1
In [24]:
list(p_params)
Out[24]:
[0, 8, 16, 24]
In [25]:
list(q_params)
Out[25]:
[0, 1, 2]
In [26]:
# Prepare to train a model with every combination of hyperparameters in p_params and q_params
for p in p_params:
    print(p)
0
8
16
24
In [27]:
for p in p_params:
    for q in q_params:
        order = (p, 0, q)
        print(order)
(0, 0, 0)
(0, 0, 1)
(0, 0, 2)
(8, 0, 0)
(8, 0, 1)
(8, 0, 2)
(16, 0, 0)
(16, 0, 1)
(16, 0, 2)
(24, 0, 0)
(24, 0, 1)
(24, 0, 2)
In [ ]:
for p in p_params:
    for q in q_params:
        order = (p, 0, q)
        start_time = time.time()
        model = ARIMA(y_train, order=order).fit()
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA model {order} in {elapsed_time} seconds.")
Trained ARIMA model (0, 0, 0) in 0.31 seconds.
Trained ARIMA model (0, 0, 1) in 0.38 seconds.
Trained ARIMA model (0, 0, 2) in 1.69 seconds.
Trained ARIMA model (8, 0, 0) in 10.4 seconds.
Trained ARIMA model (8, 0, 1) in 37.7 seconds.
Trained ARIMA model (8, 0, 2) in 68.3 seconds.
Trained ARIMA model (16, 0, 0) in 36.11 seconds.
Trained ARIMA model (16, 0, 1) in 128.7 seconds.
Trained ARIMA model (16, 0, 2) in 190.1 seconds.
Trained ARIMA model (24, 0, 0) in 99.32 seconds.
In [29]:
time.time()
Out[29]:
1656013950.9323714
In [30]:
# Create dictionary to store MAEs
mae_grid = dict()
# Outer loop: Iterate through possible values for `p`
for p in p_params:
    # Create key-value pair in dict. Key is `p`, value is empty list.
    mae_grid[p] = list()
    # Inner loop: Iterate through possible values for `q`
    for q in q_params:
        # Combination of hyperparameters for model
        order = (p, 0, q)
        # Note start time
        start_time = time.time()
        # Train model
        model = ARIMA(y_train, order=order).fit()
        # Calculate model training time
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA {order} in {elapsed_time} seconds.")
        # Generate in-sample (training) predictions
        y_pred = model.predict()
        # Calculate training MAE
        mae = mean_absolute_error(y_train, y_pred)    # Every time the model is trained, the mean absolute error is calculated and then saved to a dictionary
        # Append MAE to list in dictionary
        mae_grid[p].append(mae)

print()
print(mae_grid)
Trained ARIMA (0, 0, 0) in 0.29 seconds.
Trained ARIMA (0, 0, 1) in 0.38 seconds.
Trained ARIMA (0, 0, 2) in 1.19 seconds.
Trained ARIMA (8, 0, 0) in 9.4 seconds.
Trained ARIMA (8, 0, 1) in 37.39 seconds.
Trained ARIMA (8, 0, 2) in 69.8 seconds.
Trained ARIMA (16, 0, 0) in 38.31 seconds.
Trained ARIMA (16, 0, 1) in 125.51 seconds.
Trained ARIMA (16, 0, 2) in 194.2 seconds.
Trained ARIMA (24, 0, 0) in 97.89 seconds.
Trained ARIMA (24, 0, 1) in 131.2 seconds.
Trained ARIMA (24, 0, 2) in 251.3 seconds.

{0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647], 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133], 16: [2.9201084726122, 2.929436207635203, 2.913638894139686], 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}
In [31]:
mae_grid
Out[31]:
{0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647],
 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133],
 16: [2.9201084726122, 2.929436207635203, 2.913638894139686],
 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}
In [32]:
# Organize all the MAE's from above in a DataFrame names mae_df
mae_df = pd.DataFrame(mae_grid)
mae_df.round(4)
Out[32]:
0 8 16 24
0 4.1715 2.9384 2.9201 2.9144
1 3.3506 2.9149 2.9294 2.9136
2 3.1057 2.9080 2.9136 2.8979
In [33]:
# Create heatmap of the values in mae_grid
sns.heatmap(mae_df, cmap="Blues")
plt.xlabel("p values")  # Label x-axis "p values"
plt.ylabel("q values")  # Label y-axis "q values"
plt.title("ARMA Grid Search (Criterion: MAE)");
In [34]:
# Use the plot_diagnostics method to check the residuals for my model
fig, ax = plt.subplots(figsize=(15, 12)) 
model.plot_diagnostics(fig=fig);       # Plot will represent the residuals from the last model I trained

Evaluate¶

In [ ]:
# Perform walk-forward validation for my model for the entire test set y_test
y_pred_wfv = pd.Series()
history = y_train.copy()
for i in range(len(y_test)):
    model = ARIMA(history, order=(8,0,1)).fit()
    next_pred = model.forecast()
    y_pred_wfv = y_pred_wfv.append(next_pred)   # Store my model's predictions in the Series y_pred_wfv
    history = history.append(y_test[next_pred.index])
In [36]:
test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))
Test MAE (walk forward validation): 1.67

Communicate Results¶

In [37]:
# Generate the list of training predictions for my model
df_predictions = pd.DataFrame({"y_test": y_test, "y_pred_wfv": y_pred_wfv})
fig = px.line(df_predictions, labels={"value": "PM2.5"})
fig.show()