Air Quality in Dar es Salaam 🇹🇿
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import inspect
import time
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pprint import pp
from statsmodels.tsa.ar_model import AutoReg
warnings.filterwarnings("ignore")
# Connect to MongoDB server running at host "localhost" on port 27017
client = MongoClient(host="localhost", port=27017)
#db = client["air-quality"] # Connect to the "air-quality" database
db
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'air-quality')
for c in db.list_collections():
print(c['name'])
# Assign the collection for Dar es Salaam to the variable name dar
dar = db['dar-es-salaam']
# Determine the numbers assigned to all the sensor sites in the Dar es Salaam collection
sites = dar.distinct("metadata.site")
sites
[11, 23]
# Determine which site in the Dar es Salaam collection has the most sensor readings (of any type, not just PM2.5 readings)
result = dar.aggregate(
[
{"$group":{"_id":"$metadata.site", "count":{"$count":{}}}}
])
readings_per_site = list(result) # readings_per_site should be a list of dictionaries
readings_per_site
[{'_id': 11, 'count': 138412}, {'_id': 23, 'count': 60020}]
I need to create a wrangle function that will extract the PM2.5 readings from the site that has the most total readings in the Dar es Salaam collection. My function should do the following steps:
"Africa/Dar_es_Salaam"
.y
.results = dar.find_one({})
pp(results)
{'timestamp': datetime.datetime(2018, 1, 1, 0, 0, 48, 41000), 'metadata': {'lat': -6.818, 'lon': 39.285, 'measurement': 'temperature', 'sensor_id': 34, 'sensor_type': 'DHT22', 'site': 11}, '_id': ObjectId('62a1511b5e1eb9135975e7d8'), 'temperature': 30.1}
def wrangle(collection):
results = collection.find(
{"metadata.site":11, "metadata.measurement": "P2"},
projection={"P2": 1, "timestamp": 1, "_id": 0},
)
# Read results into DataFrame
df = pd.DataFrame(list(results)).set_index("timestamp")
# Localize timezone
df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")
# Remove outliers
df = df[df["P2"] < 100]
# Resample and forward-fill
y = df["P2"].resample("1H").mean().fillna(method="ffill")
return y
Use my wrangle
function to query the dar
collection and return my cleaned results.
y = wrangle(dar)
y.head()
timestamp 2018-01-01 03:00:00+03:00 9.456327 2018-01-01 04:00:00+03:00 9.400833 2018-01-01 05:00:00+03:00 9.331458 2018-01-01 06:00:00+03:00 9.528776 2018-01-01 07:00:00+03:00 8.861250 Freq: H, Name: P2, dtype: float64
# Create a time series plot of the readings in y
fig, ax = plt.subplots(figsize=(15, 6))
y.plot(xlabel="Date", ylabel="PM2.5 Level",title = "Dar es Salaam PM2.5 Levels", ax=ax); # Label x-axis "Date", y-axis "PM2.5 Level", & title "Dar es Salaam PM2.5 Levels"
# Plot the rolling average of the readings in y
fig, ax = plt.subplots(figsize=(15, 6))
y.rolling(168).mean().plot(xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average", ax=ax); # Use a window size of 168 (the number of hours in a week)
# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
plt.xlabel("Lag [hours]") # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient") # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, ACF"); # Title "Dar es Salaam PM2.5 Readings, ACF"
# Create an PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
plt.xlabel("Lag [hours]") # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient") # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, PACF"); # Title "Dar es Salaam PM2.5 Readings, PACF"
# Split y into training and test sets
cutoff_test = int(len(y) * 0.90)
y_train = y.iloc[:cutoff_test] # The first 90% of the data should be in my training set
y_test = y.iloc[cutoff_test:] # The remaining 10% should be in the test set
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
y_train shape: (1533,) y_test shape: (171,)
# Establish the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", y_train_mean)
print("Baseline MAE:", mae_baseline)
Mean P2 Reading: 8.617582545265433 Baseline MAE: 4.07658759405218
# Use a for loop to train my AR model on using settings for p from 1 to 30
p_params = range(1, 31)
maes = []
for p in p_params:
# Note start time
start_time = time.time()
# Train model
model = AutoReg(y_train, lags=p).fit()
# Calculate model training time
elapsed_time = round(time.time() - start_time, 2)
print(f"Trained AR {p} in {elapsed_time} seconds.")
# Generate in-sample(training predictions)
y_pred = model.predict()
# Calculate in-sample (training) predictions
mae = mean_absolute_error(y_train.iloc[p:], y_pred.iloc[p:]) # Each time I train a new model, calculate its mean absolute error and append the result to the list maes
# Append MAE to list in dictionary
maes.append(mae)
mae_series = pd.Series(maes, name="mae", index=p_params) # Store my results in the Series mae_series
mae_series.head()
Trained AR 1 in 0.0 seconds. Trained AR 2 in 0.0 seconds. Trained AR 3 in 0.0 seconds. Trained AR 4 in 0.0 seconds. Trained AR 5 in 0.0 seconds. Trained AR 6 in 0.11 seconds. Trained AR 7 in 0.1 seconds. Trained AR 8 in 0.2 seconds. Trained AR 9 in 0.01 seconds. Trained AR 10 in 0.01 seconds. Trained AR 11 in 0.1 seconds. Trained AR 12 in 0.2 seconds. Trained AR 13 in 0.4 seconds. Trained AR 14 in 0.01 seconds. Trained AR 15 in 0.2 seconds. Trained AR 16 in 0.4 seconds. Trained AR 17 in 0.2 seconds. Trained AR 18 in 0.2 seconds. Trained AR 19 in 0.1 seconds. Trained AR 20 in 0.3 seconds. Trained AR 21 in 0.6 seconds. Trained AR 22 in 0.3 seconds. Trained AR 23 in 0.9 seconds. Trained AR 24 in 0.5 seconds. Trained AR 25 in 0.7 seconds. Trained AR 26 in 0.6 seconds. Trained AR 27 in 0.6 seconds. Trained AR 28 in 0.3 seconds. Trained AR 29 in 0.4 seconds. Trained AR 30 in 0.6 seconds.
1 0.947888 2 0.933894 3 0.920850 4 0.920153 5 0.919519 Name: mae, dtype: float64
# Look through the results in mae_series and determine what value for p provides the best performance
mae_series.idxmax()
1
# Build and train final_model using the best hyperparameter value
best_p = mae_series.idxmax()
best_model = AutoReg(y_train, lags=best_p).fit()
# Calculate the training residuals for best_model and assign the result to y_train_resid
y_train_resid = best_model.resid
y_train_resid.name = "residuals" # The name of my Series should be "residuals"
y_train_resid.head()
timestamp 2018-01-01 04:00:00+03:00 0.003846 2018-01-01 05:00:00+03:00 -0.013904 2018-01-01 06:00:00+03:00 0.247951 2018-01-01 07:00:00+03:00 -0.603135 2018-01-01 08:00:00+03:00 -0.509025 Freq: H, Name: residuals, dtype: float64
# Create a histogram of y_train_resid
y_train_resid.hist()
plt.xlabel("Residuals") # Label the x-axis as "Residuals"
plt.ylabel("Frequency") # Label the y-axis as "Frequency"
plt.title("Best Model, Training Residuals"); # Title as "Best Model, Training Residuals"
# Create an ACF plot for y_train_resid
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax)
plt.xlabel("Lag [hours]") # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient") # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, ACF"); # Title "Dar es Salaam, Training Residuals ACF"
# Perform walk-forward validation for my model for the entire test set y_test
y_pred_wfv = pd.Series(dtype="float64")
history = y_train.copy()
for i in range(len(y_test)):
model = AutoReg(history, lags=28).fit()
next_pred = model.forecast()
y_pred_wfv = y_pred_wfv.append(next_pred) # Store my model's predictions in the Series y_pred_wfv
history = history.append(y_test[next_pred.index])
y_pred_wfv.name = "prediction" # The name of my Series is "prediction"
y_pred_wfv.index.name = "timestamp" # The name of my Series index is "timestamp"
y_pred_wfv.head()
timestamp 2018-03-06 00:00:00+03:00 8.056391 2018-03-06 01:00:00+03:00 8.681779 2018-03-06 02:00:00+03:00 6.268951 2018-03-06 03:00:00+03:00 6.303760 2018-03-06 04:00:00+03:00 7.171444 Freq: H, Name: prediction, dtype: float64
# Put the values for y_test and y_pred_wfv into the DataFrame df_pred_test
df_pred_test = pd.DataFrame(
{
"y_test": y_test, "y_pred_wfv": y_pred_wfv
}
)
# Plot df_pred_test using plotly express
fig = px.line(df_pred_test)
fig.update_layout(
title="Dar es Salaam, WFV Predictions", # Title "Dar es Salaam, WFV Predictions"
xaxis_title="Date", # Label the x-axis as "Date"
yaxis_title="PM2.5 Level", # Label the y-axis as "PM2.5 Level"
)