__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import inspect
import time
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

warnings.filterwarnings("ignore")


# Create a client to connect to the MongoDB server that assigns the "air-quality" database to db & the "nairobi" connection to nairobi
client = MongoClient(host="localhost", port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]


# Change my wrangle function so that it has a resample_rule argument that allows the user to change the resampling interval

def wrangle(collection, resample_rule="1H"):

    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500]

    # Resample and forward-fill
    y = df["P2"].resample("1H").mean().fillna(method="ffill")           # The argument default should be "1H"
    
    return y


# Use my wrangle function to read the data from the nairobi collection into the Series y
y = wrangle(nairobi)
y.head()

timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64


# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax);
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"


# Create an PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax); 
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"


# Create a training set y_train that contains only readings from October 2018
y_train = y.loc["2018-10-01":"2018-10-31"]
y_test = y.loc["2018-11-01"]               # Create a test set y_test that contains readings from November 1, 2018


# Calculate the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

Mean P2 Reading: 10.12
Baseline MAE: 4.17


# Create ranges for possible  𝑝  and  𝑞  values
p_params = range(0, 25, 8)  # p_params should range between 0 and 25, by steps of 8
q_params = range(0, 3, 1)   # q_params should range between 0 and 3 by steps of 1


list(p_params)

[0, 8, 16, 24]


list(q_params)

[0, 1, 2]


# Prepare to train a model with every combination of hyperparameters in p_params and q_params
for p in p_params:
    print(p)

0
8
16
24


for p in p_params:
    for q in q_params:
        order = (p, 0, q)
        print(order)

(0, 0, 0)
(0, 0, 1)
(0, 0, 2)
(8, 0, 0)
(8, 0, 1)
(8, 0, 2)
(16, 0, 0)
(16, 0, 1)
(16, 0, 2)
(24, 0, 0)
(24, 0, 1)
(24, 0, 2)


for p in p_params:
    for q in q_params:
        order = (p, 0, q)
        start_time = time.time()
        model = ARIMA(y_train, order=order).fit()
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA model {order} in {elapsed_time} seconds.")

Trained ARIMA model (0, 0, 0) in 0.31 seconds.
Trained ARIMA model (0, 0, 1) in 0.38 seconds.
Trained ARIMA model (0, 0, 2) in 1.69 seconds.
Trained ARIMA model (8, 0, 0) in 10.4 seconds.
Trained ARIMA model (8, 0, 1) in 37.7 seconds.
Trained ARIMA model (8, 0, 2) in 68.3 seconds.
Trained ARIMA model (16, 0, 0) in 36.11 seconds.
Trained ARIMA model (16, 0, 1) in 128.7 seconds.
Trained ARIMA model (16, 0, 2) in 190.1 seconds.
Trained ARIMA model (24, 0, 0) in 99.32 seconds.


time.time()

1656013950.9323714


# Create dictionary to store MAEs
mae_grid = dict()
# Outer loop: Iterate through possible values for `p`
for p in p_params:
    # Create key-value pair in dict. Key is `p`, value is empty list.
    mae_grid[p] = list()
    # Inner loop: Iterate through possible values for `q`
    for q in q_params:
        # Combination of hyperparameters for model
        order = (p, 0, q)
        # Note start time
        start_time = time.time()
        # Train model
        model = ARIMA(y_train, order=order).fit()
        # Calculate model training time
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA {order} in {elapsed_time} seconds.")
        # Generate in-sample (training) predictions
        y_pred = model.predict()
        # Calculate training MAE
        mae = mean_absolute_error(y_train, y_pred)    # Every time the model is trained, the mean absolute error is calculated and then saved to a dictionary
        # Append MAE to list in dictionary
        mae_grid[p].append(mae)

print()
print(mae_grid)

Trained ARIMA (0, 0, 0) in 0.29 seconds.
Trained ARIMA (0, 0, 1) in 0.38 seconds.
Trained ARIMA (0, 0, 2) in 1.19 seconds.
Trained ARIMA (8, 0, 0) in 9.4 seconds.
Trained ARIMA (8, 0, 1) in 37.39 seconds.
Trained ARIMA (8, 0, 2) in 69.8 seconds.
Trained ARIMA (16, 0, 0) in 38.31 seconds.
Trained ARIMA (16, 0, 1) in 125.51 seconds.
Trained ARIMA (16, 0, 2) in 194.2 seconds.
Trained ARIMA (24, 0, 0) in 97.89 seconds.
Trained ARIMA (24, 0, 1) in 131.2 seconds.
Trained ARIMA (24, 0, 2) in 251.3 seconds.

{0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647], 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133], 16: [2.9201084726122, 2.929436207635203, 2.913638894139686], 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}


mae_grid

{0: [4.171460443827197, 3.3506427433555537, 3.1057222587888647],
 8: [2.9384480570404223, 2.9149008203151663, 2.908046094677133],
 16: [2.9201084726122, 2.929436207635203, 2.913638894139686],
 24: [2.914390327277196, 2.9136013252035013, 2.89792511429827]}


# Organize all the MAE's from above in a DataFrame names mae_df
mae_df = pd.DataFrame(mae_grid)
mae_df.round(4)


# Create heatmap of the values in mae_grid
sns.heatmap(mae_df, cmap="Blues")
plt.xlabel("p values")  # Label x-axis "p values"
plt.ylabel("q values")  # Label y-axis "q values"
plt.title("ARMA Grid Search (Criterion: MAE)");


# Use the plot_diagnostics method to check the residuals for my model
fig, ax = plt.subplots(figsize=(15, 12)) 
model.plot_diagnostics(fig=fig);       # Plot will represent the residuals from the last model I trained


# Perform walk-forward validation for my model for the entire test set y_test
y_pred_wfv = pd.Series()
history = y_train.copy()
for i in range(len(y_test)):
    model = ARIMA(history, order=(8,0,1)).fit()
    next_pred = model.forecast()
    y_pred_wfv = y_pred_wfv.append(next_pred)   # Store my model's predictions in the Series y_pred_wfv
    history = history.append(y_test[next_pred.index])


test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))

Test MAE (walk forward validation): 1.67


# Generate the list of training predictions for my model
df_predictions = pd.DataFrame({"y_test": y_test, "y_pred_wfv": y_pred_wfv})
fig = px.line(df_predictions, labels={"value": "PM2.5"})
fig.show()

Prepare Data¶

Import¶

Explore¶

Split¶

Build Model¶

Baseline¶

Iterate¶

Evaluate¶

Communicate Results¶

	0	8	16	24
0	4.1715	2.9384	2.9201	2.9144
1	3.3506	2.9149	2.9294	2.9136
2	3.1057	2.9080	2.9136	2.8979