Air Quality in Dar es Salaam 🇹🇿

In [21]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In [22]:
import inspect
import time
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pprint import pp
from statsmodels.tsa.ar_model import AutoReg

warnings.filterwarnings("ignore")

Prepare Data¶

Connect¶

In [26]:
# Connect to MongoDB server running at host "localhost" on port 27017
client = MongoClient(host="localhost", port=27017)
#db = client["air-quality"]   # Connect to the "air-quality" database 
In [27]:
db
Out[27]:
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'air-quality')
In [ ]:
for c in db.list_collections():
    print(c['name'])
In [36]:
# Assign the collection for Dar es Salaam to the variable name dar
dar = db['dar-es-salaam'] 

Explore¶

In [6]:
# Determine the numbers assigned to all the sensor sites in the Dar es Salaam collection
sites = dar.distinct("metadata.site")
sites
Out[6]:
[11, 23]
In [7]:
# Determine which site in the Dar es Salaam collection has the most sensor readings (of any type, not just PM2.5 readings)
result = dar.aggregate(
        [
            {"$group":{"_id":"$metadata.site", "count":{"$count":{}}}}
        ])
readings_per_site = list(result)     # readings_per_site should be a list of dictionaries
readings_per_site
Out[7]:
[{'_id': 11, 'count': 138412}, {'_id': 23, 'count': 60020}]

Import¶

I need to create a wrangle function that will extract the PM2.5 readings from the site that has the most total readings in the Dar es Salaam collection. My function should do the following steps:

  1. Localize reading time stamps to the timezone for "Africa/Dar_es_Salaam".
  2. Remove all outlier PM2.5 readings that are above 100.
  3. Resample the data to provide the mean PM2.5 reading for each hour.
  4. Impute any missing values using the forward-will method.
  5. Return a Series y.
In [8]:
results = dar.find_one({})
pp(results)
{'timestamp': datetime.datetime(2018, 1, 1, 0, 0, 48, 41000),
 'metadata': {'lat': -6.818,
              'lon': 39.285,
              'measurement': 'temperature',
              'sensor_id': 34,
              'sensor_type': 'DHT22',
              'site': 11},
 '_id': ObjectId('62a1511b5e1eb9135975e7d8'),
 'temperature': 30.1}
In [9]:
def wrangle(collection):
    
    results = collection.find(
        {"metadata.site":11, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )
    
    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Dar_es_Salaam")

    # Remove outliers
    df = df[df["P2"] < 100]

    # Resample and forward-fill
    y = df["P2"].resample("1H").mean().fillna(method="ffill")
    
    return y

Use my wrangle function to query the dar collection and return my cleaned results.

In [10]:
y = wrangle(dar)
y.head()
Out[10]:
timestamp
2018-01-01 03:00:00+03:00    9.456327
2018-01-01 04:00:00+03:00    9.400833
2018-01-01 05:00:00+03:00    9.331458
2018-01-01 06:00:00+03:00    9.528776
2018-01-01 07:00:00+03:00    8.861250
Freq: H, Name: P2, dtype: float64

Explore Some More¶

In [11]:
# Create a time series plot of the readings in y
fig, ax = plt.subplots(figsize=(15, 6))
y.plot(xlabel="Date", ylabel="PM2.5 Level",title = "Dar es Salaam PM2.5 Levels", ax=ax); # Label x-axis "Date", y-axis "PM2.5 Level", &  title "Dar es Salaam PM2.5 Levels"
In [12]:
# Plot the rolling average of the readings in y
fig, ax = plt.subplots(figsize=(15, 6))
y.rolling(168).mean().plot(xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average", ax=ax);  # Use a window size of 168 (the number of hours in a week)
In [13]:
# Create an ACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
plt.xlabel("Lag [hours]")             # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient") # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, ACF"); # Title "Dar es Salaam PM2.5 Readings, ACF"
In [14]:
# Create an PACF plot for the data in y
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
plt.xlabel("Lag [hours]")              # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient")  # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, PACF"); # Title "Dar es Salaam PM2.5 Readings, PACF"

Split¶

In [15]:
# Split y into training and test sets
cutoff_test = int(len(y) * 0.90)
y_train = y.iloc[:cutoff_test]    # The first 90% of the data should be in my training set
y_test = y.iloc[cutoff_test:]     # The remaining 10% should be in the test set
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
y_train shape: (1533,)
y_test shape: (171,)

Build Model¶

Baseline¶

In [16]:
# Establish the baseline mean absolute error for my model
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", y_train_mean)
print("Baseline MAE:", mae_baseline)
Mean P2 Reading: 8.617582545265433
Baseline MAE: 4.07658759405218

Iterate¶

In [17]:
# Use a for loop to train my AR model on using settings for p from 1 to 30

p_params = range(1, 31)
maes = []
for p in p_params:
    # Note start time
    start_time = time.time()
    # Train model
    model = AutoReg(y_train, lags=p).fit()
    # Calculate model training time
    elapsed_time = round(time.time() - start_time, 2)
    print(f"Trained AR {p} in {elapsed_time} seconds.")
    # Generate in-sample(training predictions)
    y_pred = model.predict()
    # Calculate in-sample (training) predictions
    mae = mean_absolute_error(y_train.iloc[p:], y_pred.iloc[p:])   # Each time I train a new model, calculate its mean absolute error and append the result to the list maes
    # Append MAE to list in dictionary
    maes.append(mae)
    
     
mae_series = pd.Series(maes, name="mae", index=p_params)   # Store my  results in the Series mae_series
mae_series.head()
Trained AR 1 in 0.0 seconds.
Trained AR 2 in 0.0 seconds.
Trained AR 3 in 0.0 seconds.
Trained AR 4 in 0.0 seconds.
Trained AR 5 in 0.0 seconds.
Trained AR 6 in 0.11 seconds.
Trained AR 7 in 0.1 seconds.
Trained AR 8 in 0.2 seconds.
Trained AR 9 in 0.01 seconds.
Trained AR 10 in 0.01 seconds.
Trained AR 11 in 0.1 seconds.
Trained AR 12 in 0.2 seconds.
Trained AR 13 in 0.4 seconds.
Trained AR 14 in 0.01 seconds.
Trained AR 15 in 0.2 seconds.
Trained AR 16 in 0.4 seconds.
Trained AR 17 in 0.2 seconds.
Trained AR 18 in 0.2 seconds.
Trained AR 19 in 0.1 seconds.
Trained AR 20 in 0.3 seconds.
Trained AR 21 in 0.6 seconds.
Trained AR 22 in 0.3 seconds.
Trained AR 23 in 0.9 seconds.
Trained AR 24 in 0.5 seconds.
Trained AR 25 in 0.7 seconds.
Trained AR 26 in 0.6 seconds.
Trained AR 27 in 0.6 seconds.
Trained AR 28 in 0.3 seconds.
Trained AR 29 in 0.4 seconds.
Trained AR 30 in 0.6 seconds.
Out[17]:
1    0.947888
2    0.933894
3    0.920850
4    0.920153
5    0.919519
Name: mae, dtype: float64
In [18]:
# Look through the results in mae_series and determine what value for p provides the best performance
mae_series.idxmax()
Out[18]:
1
In [19]:
# Build and train final_model using the best hyperparameter value
best_p = mae_series.idxmax()
best_model = AutoReg(y_train, lags=best_p).fit()
In [20]:
# Calculate the training residuals for best_model and assign the result to y_train_resid
y_train_resid = best_model.resid
y_train_resid.name = "residuals"   # The name of my Series should be "residuals"
y_train_resid.head()
Out[20]:
timestamp
2018-01-01 04:00:00+03:00    0.003846
2018-01-01 05:00:00+03:00   -0.013904
2018-01-01 06:00:00+03:00    0.247951
2018-01-01 07:00:00+03:00   -0.603135
2018-01-01 08:00:00+03:00   -0.509025
Freq: H, Name: residuals, dtype: float64
In [21]:
# Create a histogram of y_train_resid
y_train_resid.hist()
plt.xlabel("Residuals") # Label the x-axis as "Residuals"
plt.ylabel("Frequency") # Label the y-axis as "Frequency"
plt.title("Best Model, Training Residuals"); # Title as "Best Model, Training Residuals"
In [22]:
# Create an ACF plot for y_train_resid
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax)
plt.xlabel("Lag [hours]")               # Label the x-axis as "Lag [hours]"
plt.ylabel("Correlation Coefficient")   # Label the y-axis as "Correlation Coefficient"
plt.title("Dar es Salaam PM2.5 Readings, ACF");  #  Title "Dar es Salaam, Training Residuals ACF"

Evaluate¶

In [23]:
# Perform walk-forward validation for my model for the entire test set y_test
y_pred_wfv = pd.Series(dtype="float64")
history = y_train.copy()
for i in range(len(y_test)):
    model = AutoReg(history, lags=28).fit()
    next_pred = model.forecast()
    y_pred_wfv = y_pred_wfv.append(next_pred)        # Store my model's predictions in the Series y_pred_wfv
    history = history.append(y_test[next_pred.index])
y_pred_wfv.name = "prediction"                       # The name of my Series is "prediction"
y_pred_wfv.index.name = "timestamp"                  # The name of my Series index is "timestamp"
y_pred_wfv.head()
Out[23]:
timestamp
2018-03-06 00:00:00+03:00    8.056391
2018-03-06 01:00:00+03:00    8.681779
2018-03-06 02:00:00+03:00    6.268951
2018-03-06 03:00:00+03:00    6.303760
2018-03-06 04:00:00+03:00    7.171444
Freq: H, Name: prediction, dtype: float64

Communicate Results¶

In [24]:
# Put the values for y_test and y_pred_wfv into the DataFrame df_pred_test
df_pred_test = pd.DataFrame(
    {
        "y_test": y_test, "y_pred_wfv": y_pred_wfv
    }
)

# Plot df_pred_test using plotly express
fig = px.line(df_pred_test)
fig.update_layout(
    title="Dar es Salaam, WFV Predictions", # Title "Dar es Salaam, WFV Predictions"
    xaxis_title="Date",                     # Label the x-axis as "Date" 
    yaxis_title="PM2.5 Level",              # Label the y-axis as "PM2.5 Level"
)