Autoregressive Models

In [8]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In [9]:
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg

warnings.simplefilter(action="ignore", category=FutureWarning)

An autoregressive (AR) model is very similar to a linear regression model: it's the same basic idea, just fit with a slightly different package (statsmodels instead of scikit-learn).

Goal - Create an autoregressive (AR) model to predict PM2.5 levels.
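To make that connection concrete, here is a small self-contained sketch on toy data (not part of this project) showing that fitting an AR(p) model amounts to an ordinary linear regression on lagged copies of the series:

# Toy sketch: an AR(p) model is a linear regression on lagged values of the series
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.ar_model import AutoReg

rng = np.random.default_rng(42)
s = pd.Series(rng.normal(size=500)).cumsum()   # toy random-walk series

p = 3
lagged = pd.concat({f"lag_{i}": s.shift(i) for i in range(1, p + 1)}, axis=1)
data = pd.concat([lagged, s.rename("target")], axis=1).dropna()

lr = LinearRegression().fit(data.drop(columns="target"), data["target"])
ar = AutoReg(s, lags=p).fit()
# lr.intercept_ and lr.coef_ should match ar.params (the constant plus the
# lag coefficients) up to numerical noise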

Machine Learning Workflow

  1. Prepare Data
      1. Import
      2. Explore: ACF plot, PACF plot
      3. Split
  2. Build Model
      1. Baseline
      2. Iterate: AR model with statsmodels
      3. Evaluate: Walk-forward validation
  3. Communicate Results
      1. Time series plot with plotly express

Prepare Data

Import

In [10]:
# Create a client to connect to the MongoDB server, then assign the "air-quality" database to db and the "nairobi" collection to nairobi

client = MongoClient(host="localhost", port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]
In [11]:
# Change the wrangle function so that it returns a Series of the resampled data instead of a DataFrame

def wrangle(collection):
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read data into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500]

    # Resample to 1-hour windows, taking the mean and forward-filling missing values
    y = df["P2"].resample("1H").mean().ffill()
    return y
In [ ]:
# Use my wrangle function to read the data from the nairobi collection into the Series y
y = wrangle(nairobi)
y.head()
In [7]:
y.shape
Out[7]:
(2928,)
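As a quick sanity check (a small sketch; the exact output depends on what was pulled from MongoDB), the resampled series should contain no missing hours after the forward fill:

# Sketch: confirm the hourly resample and forward fill left no gaps
y.isnull().sum()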

Explore

In [8]:
y.corr(y)
Out[8]:
1.0
In [9]:
y.corr(y.shift(1))
Out[9]:
0.6506785333937847
In [10]:
y.corr(y.shift(3))
Out[10]:
0.31272080474576847
In [11]:
# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax) 
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"
In [12]:
y.corr(y.shift(1))
Out[12]:
0.6506785333937847
In [13]:
y.shift(1).corr(y.shift(2))
Out[13]:
0.6506810900745511
In [14]:
# Create a PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient");  # Label the y-axis as "Correlation Coefficient"

Split

In [ ]:
len(y)
In [ ]:
int(len(y) * 0.95)
In [ ]:
# Split y into training and test sets 

cutoff_test = int(len(y) * 0.95)      
y_train = y.iloc[:cutoff_test]         # The first 95% of the data should be in my training set
y_test = y.iloc[cutoff_test:]          # The remaining 5% should be in the test set
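Because this is time-series data, the split is chronological rather than random: the model trains on the earliest 95% of hours and is evaluated on the most recent 5%.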
In [ ]:
len(y_train) + len(y_test)
In [ ]:
len(y_train) + len(y_test) == len(y)

Build Model

Baseline

In [ ]:
# Calculate the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

Iterate

In [ ]:
# Instantiate an AutoReg model and fit it to the training data y_train
model = AutoReg(y_train, lags=26).fit()  # Be sure to set the lags argument to 26
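Under the hood, the AR(26) model expresses each hour's reading as a linear combination of the previous 26 hourly readings, plus an intercept and an error term:

P2_t = c + φ1·P2_(t-1) + φ2·P2_(t-2) + ... + φ26·P2_(t-26) + ε_t

The fitted φ coefficients are what appear as the P2.L1 through P2.L26 parameters printed under Communicate Results.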
In [ ]:
model.predict()
In [ ]:
model.predict().isnull().sum()
In [ ]:
# Generate training predictions for my model and use them to calculate my training mean absolute error
y_pred = model.predict().dropna()
training_mae = mean_absolute_error(y_train.iloc[26:], y_pred)
print("Training MAE:", training_mae)
In [ ]:
# Use y_train and y_pred to calculate the residuals for my model
y_train_resid = y_train - y_pred
y_train_resid.tail()
In [ ]:
y_train_resid = model.resid
y_train_resid.tail()
In [ ]:
# Create a plot of y_train_resid
fig, ax = plt.subplots(figsize=(15, 6))
y_train_resid.plot(ylabel="Residual Value", ax=ax);
In [ ]:
# Create a histogram of y_train_resid
y_train_resid.hist()
plt.xlabel("Residual Value")
plt.ylabel("Frequency")
plt.title("AR(26), Distribution of Residuals");
In [ ]:
# Create an ACF plot of y_train_resid
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax);
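If the AR(26) model has captured the serial dependence in the data, this residual ACF should show no large spikes beyond lag 0; in other words, the residuals should look roughly like white noise.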

Evaluate

In [ ]:
y_test.head()
In [ ]:
y_test.tail()
In [ ]:
y_test.index.min()
In [ ]:
y_test.index.max()
In [ ]:
# Calculate the test mean absolute error for my model
y_pred_test = model.predict(y_test.index.min(), y_test.index.max())
test_mae = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", test_mae)
In [ ]:
# Create a DataFrame df_pred_test with two columns: "y_test" (the true values for my
# test set) and "y_pred" (my model's predictions), with its index matching y_test

df_pred_test = pd.DataFrame(
    {"y_test": y_test, "y_pred": y_pred_test}, index=y_test.index
)
In [ ]:
# Create a time series plot for the values in df_pred_test using plotly express
fig = px.line(df_pred_test, labels={"value": "P2"}) # Be sure that the y-axis is properly labeled as "P2"
fig.show()
In [165]:
%%capture
# Perform walk-forward validation for my model over the entire test set y_test,
# storing the model's one-step-ahead predictions in the Series y_pred_wfv
y_pred_wfv = pd.Series(dtype="float64")
history = y_train.copy()
for i in range(len(y_test)):
    model = AutoReg(history, lags=26).fit()                   # Refit on all data seen so far
    next_pred = model.forecast()                              # Forecast the next hour
    y_pred_wfv = pd.concat([y_pred_wfv, next_pred])           # Collect the prediction
    history = pd.concat([history, y_test[next_pred.index]])   # Append the true value to history
In [166]:
len(y_pred_wfv)
Out[166]:
147
In [167]:
len(y_test)
Out[167]:
147
In [168]:
history.tail(1)
Out[168]:
timestamp
2019-01-01 02:00:00+03:00    18.803333
Freq: H, Name: P2, dtype: float64
In [169]:
y_test.head(1)
Out[169]:
timestamp
2018-12-26 00:00:00+03:00    5.679091
Freq: H, Name: P2, dtype: float64
In [170]:
model = AutoReg(history, lags=26).fit()
In [171]:
model.forecast()
Out[171]:
2019-01-01 03:00:00+03:00    14.110356
Freq: H, dtype: float64
In [172]:
# Calculate the test mean absolute error for my model
test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))
Test MAE (walk forward validation): 1.4

Communicate Results

In [173]:
# Print out the parameters for my trained model
print(model.params)
intercept    2.021763
P2.L1        0.587991
P2.L2        0.019749
P2.L3        0.023426
P2.L4        0.026807
P2.L5        0.044019
P2.L6       -0.101889
P2.L7        0.029814
P2.L8        0.049894
P2.L9       -0.017513
P2.L10       0.032585
P2.L11       0.064034
P2.L12       0.005978
P2.L13       0.018366
P2.L14      -0.007634
P2.L15      -0.016571
P2.L16      -0.015587
P2.L17      -0.035295
P2.L18       0.000929
P2.L19      -0.003956
P2.L20      -0.020668
P2.L21      -0.012730
P2.L22       0.052431
P2.L23       0.074405
P2.L24      -0.024263
P2.L25       0.090515
P2.L26      -0.088646
dtype: float64
In [174]:
# Put the values for y_test and y_pred_wfv into the DataFrame df_pred_test

df_pred_test = pd.DataFrame(
    {"y_test": y_test, "y_pred_wfv": y_pred_wfv}
)
fig = px.line(df_pred_test, labels={"value": "PM2.5"}) # Plot df_pred_test using plotly express
fig.show()