Linear Regression with Time Series Data 🇰🇪
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pytz
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
I've been able to get data from a MongoDB database. Now I can build a model with it.
GOALS
Machine Learning Workflow
I. Prepare Data
II. Build Model
III. Communicate Results
# Create a client to connect to the MongoDB server
client = MongoClient(host="localhost", port=27017) # Connect to the server
db = client["air-quality"] # Name of my database
nairobi = db["nairobi"] # The collection with the Nairobi readings
I've got a connection to my collection, and now I want to build a wrangle function that will get data from that collection and do some cleaning steps. I'm going to build this wrangle function iteratively.
# Create wrangle function so that the results from the database query are read into the DataFrame df
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame
    return df
# Use wrangle function to read the data from the nairobi collection into the DataFrame df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
df.head(10)
| timestamp | P2 |
|---|---|
| 2018-09-01 00:00:02.472 | 34.43 |
| 2018-09-01 00:05:03.941 | 30.53 |
| 2018-09-01 00:10:04.374 | 22.80 |
| 2018-09-01 00:15:04.245 | 13.30 |
| 2018-09-01 00:20:04.869 | 16.57 |
| 2018-09-01 00:25:04.659 | 14.07 |
| 2018-09-01 00:30:05.406 | 16.33 |
| 2018-09-01 00:35:04.259 | 16.50 |
| 2018-09-01 00:40:04.219 | 12.43 |
| 2018-09-01 00:45:04.042 | 11.07 |
df.index[:5]
DatetimeIndex(['2018-09-01 00:00:02.472000', '2018-09-01 00:05:03.941000', '2018-09-01 00:10:04.374000', '2018-09-01 00:15:04.245000', '2018-09-01 00:20:04.869000'], dtype='datetime64[ns]', name='timestamp', freq=None)
df.index.tz_localize("UTC")[:5]
DatetimeIndex(['2018-09-01 00:00:02.472000+00:00', '2018-09-01 00:05:03.941000+00:00', '2018-09-01 00:10:04.374000+00:00', '2018-09-01 00:15:04.245000+00:00', '2018-09-01 00:20:04.869000+00:00'], dtype='datetime64[ns, UTC]', name='timestamp', freq=None)
df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")[:5]
DatetimeIndex(['2018-09-01 03:00:02.472000+03:00', '2018-09-01 03:05:03.941000+03:00', '2018-09-01 03:10:04.374000+03:00', '2018-09-01 03:15:04.245000+03:00', '2018-09-01 03:20:04.869000+03:00'], dtype='datetime64[ns, Africa/Nairobi]', name='timestamp', freq=None)
# Add to wrangle function so that the DatetimeIndex for df is localized to the correct timezone, "Africa/Nairobi"
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(32907, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
| 2018-09-01 03:25:04.659000+03:00 | 14.07 |
| 2018-09-01 03:30:05.406000+03:00 | 16.33 |
| 2018-09-01 03:35:04.259000+03:00 | 16.50 |
| 2018-09-01 03:40:04.219000+03:00 | 12.43 |
| 2018-09-01 03:45:04.042000+03:00 | 11.07 |
# Create a boxplot of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].plot(kind="box", vert=False, title="Distribution of PM2.5 Readings", ax=ax);
There are a lot of outliers.
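To put a rough number on that before choosing a cutoff, here's a minimal sketch (the 500 threshold is my assumption for an implausibly high PM2.5 reading, matching the cutoff I use below):

# Count readings at or above the assumed 500 cutoff
print("Readings at or above 500:", (df["P2"] >= 500).sum())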
# Add to wrangle function so that all "P2" readings above 500 are dropped from the dataset
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(32906, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
| 2018-09-01 03:25:04.659000+03:00 | 14.07 |
| 2018-09-01 03:30:05.406000+03:00 | 16.33 |
| 2018-09-01 03:35:04.259000+03:00 | 16.50 |
| 2018-09-01 03:40:04.219000+03:00 | 12.43 |
| 2018-09-01 03:45:04.042000+03:00 | 11.07 |
# Create a time series plot of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].plot(xlabel="Time", ylabel="PM2.5", title="PM2.5 Time Series", ax=ax);
This time series data bounces all over the place. As we move left to right, we're moving progressively into the future. Right before 2018-10-01 there's a gap where we may be missing some readings, so the line drops.
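To check that hunch about a gap, here's a minimal sketch (not part of the original analysis) that measures the largest time difference between consecutive readings:

# Measure the largest gap between consecutive timestamps
gaps = df.index.to_series().diff()
print("Largest gap between readings:", gaps.max())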
df.head()
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
Our data comes in every five minutes or so, but we want hourly readings, so we need to resample.
# Resample
df["P2"].resample("1H") # "P2" column is what we're interested in
<pandas.core.resample.DatetimeIndexResampler object at 0x7fd2743dc1c0>
df["P2"].resample("1H").mean().head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
Whenever we're making time series predictions, the observations need to come at regular intervals, and we can't have missing values.
df["P2"].resample("1H").mean().isnull()
timestamp
2018-09-01 03:00:00+03:00    False
2018-09-01 04:00:00+03:00    False
2018-09-01 05:00:00+03:00    False
2018-09-01 06:00:00+03:00    False
2018-09-01 07:00:00+03:00    False
                             ...
2018-12-31 22:00:00+03:00    False
2018-12-31 23:00:00+03:00    False
2019-01-01 00:00:00+03:00    False
2019-01-01 01:00:00+03:00    False
2019-01-01 02:00:00+03:00    False
Freq: H, Name: P2, Length: 2928, dtype: bool
df["P2"].resample("1H").mean().isnull().sum()
102
We have 102 missing values and we need to impute them.
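The next cells impute with a forward fill. As an aside, here's a minimal sketch of an alternative I'm not using here, linear interpolation between neighboring hours:

# Alternative imputation: interpolate linearly between neighboring hours
interpolated = df["P2"].resample("1H").mean().interpolate(method="linear")
print("Missing values after interpolation:", interpolated.isnull().sum())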
# Fill null values
df["P2"].resample("1H").mean().fillna(method="ffill").head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
df["P2"].resample("1H").mean().fillna(method="ffill").isnull().sum()
0
df["P2"].resample("1H").mean().fillna(method="ffill").to_frame().head()
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 |
| 2018-09-01 04:00:00+03:00 | 15.800000 |
| 2018-09-01 05:00:00+03:00 | 11.420000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 |
| 2018-09-01 07:00:00+03:00 | 17.665000 |
# Add to my wrangle function to resample df to provide the mean "P2" reading for each hour
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    # Resample to 1H window, forward-fill missing values
    df = df["P2"].resample("1H").mean().fillna(method="ffill").to_frame()

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(2928, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 |
| 2018-09-01 04:00:00+03:00 | 15.800000 |
| 2018-09-01 05:00:00+03:00 | 11.420000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 |
| 2018-09-01 07:00:00+03:00 | 17.665000 |
| 2018-09-01 08:00:00+03:00 | 21.016667 |
| 2018-09-01 09:00:00+03:00 | 22.589167 |
| 2018-09-01 10:00:00+03:00 | 18.605833 |
| 2018-09-01 11:00:00+03:00 | 14.022500 |
| 2018-09-01 12:00:00+03:00 | 13.150000 |
df["P2"].rolling(168)
Rolling [window=168,center=False,axis=0,method=single]
df["P2"].rolling(168).mean().head()
timestamp
2018-09-01 03:00:00+03:00   NaN
2018-09-01 04:00:00+03:00   NaN
2018-09-01 05:00:00+03:00   NaN
2018-09-01 06:00:00+03:00   NaN
2018-09-01 07:00:00+03:00   NaN
Freq: H, Name: P2, dtype: float64
df["P2"].rolling(168).mean().tail()
timestamp
2018-12-31 22:00:00+03:00    6.932995
2018-12-31 23:00:00+03:00    6.935927
2019-01-01 00:00:00+03:00    6.938348
2019-01-01 01:00:00+03:00    6.973928
2019-01-01 02:00:00+03:00    7.043333
Freq: H, Name: P2, dtype: float64
df["P2"].rolling(168).mean().isnull().sum()
167
That's one less than the window size. It's telling me that the first 167 hours of my dataset have no value, because there isn't a full 168-hour window available yet to calculate the average.
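If those leading NaNs were a problem, pandas can average over partial windows instead. A minimal sketch (not used in this analysis):

# min_periods=1 lets the rolling mean use a partial window at the start
df["P2"].rolling(168, min_periods=1).mean().head()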
# Plot the rolling average of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].rolling(168).mean().plot(ax=ax, ylabel="PM2.5", title="Weekly Rolling Average"); # Use a window size of 168 (the number of hours in a week)
The big peaks and low valleys are evened out because we averaged over them, so we no longer have extreme values. The line is also a lot smoother than before the rolling average; we've smoothed out the data.
Is there a trend? Not that I'm seeing: no general movement upward or downward, and no regular cyclical movement either. With more data, say several years, we might spot seasonal or long-term trends. In this case, we don't.
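One quick way to sanity-check that impression, as a minimal sketch (not part of the original notebook): compare monthly mean readings and look for steady movement.

# Compare mean PM2.5 levels month by month to look for a trend
df["P2"].resample("1M").mean()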
df["P2"].head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
df["P2"].shift(1).head()
timestamp
2018-09-01 03:00:00+03:00          NaN
2018-09-01 04:00:00+03:00    17.541667
2018-09-01 05:00:00+03:00    15.800000
2018-09-01 06:00:00+03:00    11.420000
2018-09-01 07:00:00+03:00    11.614167
Freq: H, Name: P2, dtype: float64
df["P2.L1"] = df["P2"].shift(1)
df.head()
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 | NaN |
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
df["P2.L1"] = df["P2"].shift(1)
df.dropna().head()
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 21.016667 | 17.665000 |
# Add to my wrangle function to create a column called "P2.L1" that contains the mean "P2" reading from the previous hour
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    # Resample to 1H window, forward-fill missing values
    df = df["P2"].resample("1H").mean().fillna(method="ffill").to_frame()

    # Add lag feature
    df["P2.L1"] = df["P2"].shift(1)

    # Drop NaN rows
    df.dropna(inplace=True)

    return df
df = wrangle(nairobi)
print(df.shape)
df.head(10)
(2927, 2)
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 21.016667 | 17.665000 |
| 2018-09-01 09:00:00+03:00 | 22.589167 | 21.016667 |
| 2018-09-01 10:00:00+03:00 | 18.605833 | 22.589167 |
| 2018-09-01 11:00:00+03:00 | 14.022500 | 18.605833 |
| 2018-09-01 12:00:00+03:00 | 13.150000 | 14.022500 |
| 2018-09-01 13:00:00+03:00 | 12.806667 | 13.150000 |
# Create a correlation matrix for df
df.corr()
| | P2 | P2.L1 |
|---|---|---|
| P2 | 1.000000 | 0.650679 |
| P2.L1 | 0.650679 | 1.000000 |
# Create a scatter plot that shows the mean PM2.5 reading for each hour as a function of the mean reading from the previous hour
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=df["P2.L1"], y=df["P2"])
ax.plot([0, 120], [0, 120], linestyle="--", color="orange")
plt.xlabel("P2.L1") # "P2.L1" should be on the x-axis
plt.ylabel("P2") # "P2" should be on the y-axis
plt.title("PM2.5 Autocorrelation");
target = "P2"
y = df[target]
X = ...
y.head()
timestamp
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
2018-09-01 08:00:00+03:00    21.016667
Freq: H, Name: P2, dtype: float64
# Split the DataFrame df into the feature matrix X and the target vector y
target = "P2"
y = df[target]
X = df.drop(columns=target)
X.head()
| timestamp | P2.L1 |
|---|---|
| 2018-09-01 04:00:00+03:00 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 17.665000 |
len(X) * 0.8
2341.6
int(len(X) * 0.8)
2341
cutoff = int(len(X) * 0.8)
len(X.iloc[:cutoff])
2341
# Split X and y into training and test sets
cutoff = int(len(X) * 0.8)
X_train, y_train = X.iloc[:cutoff], y.iloc[:cutoff] # The first 80% of the data should be in my training set
X_test, y_test = X.iloc[cutoff:], y.iloc[cutoff:] # The remaining 20% should be in the test set
len(X_train)
2341
len(X_train) + len(X_test)
2927
len(X_train) + len(X_test) == len(X)
True
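Because this is time series data, the split has to preserve chronological order; shuffling would leak future information into training. For reference, here's a minimal sketch of essentially the same split using scikit-learn (not what this notebook does; the key detail is shuffle=False):

from sklearn.model_selection import train_test_split

# shuffle=False keeps rows in time order; the test set is the most recent ~20%
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, shuffle=False)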
# Calculate the baseline mean absolute error for my model
y_pred_baseline = [y_train.mean()] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train.mean(), 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 9.27
Baseline MAE: 3.89
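As a reminder of what MAE measures, here's a minimal sketch that computes the same baseline by hand: the mean of the absolute differences between each true value and the constant prediction.

import numpy as np

# Baseline MAE by hand: average absolute deviation from the training mean
mae_by_hand = np.mean(np.abs(y_train - y_train.mean()))
print("Baseline MAE (by hand):", round(mae_by_hand, 2))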
# Instantiate a LinearRegression model named "model"
model = LinearRegression()
model.fit(X_train, y_train) # Fit it to my training data
LinearRegression()
# Calculate the training and test mean absolute error for my model
training_mae = mean_absolute_error(y_train, model.predict(X_train))
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Training MAE:", round(training_mae, 2))
print("Test MAE:", round(test_mae, 2))
Training MAE: 2.46
Test MAE: 1.8
# Extract the intercept and coefficient from my model
intercept = model.intercept_.round(2)
coefficient = model.coef_.round(2)[0]
print(f"P2 = {intercept} + ({coefficient} * P2.L1)")
P2 = 3.36 + (0.64 * P2.L1)
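To confirm the fitted model really is just this equation, here's a minimal sketch that plugs the first test observation into it by hand and compares against model.predict:

# Plug the first lagged reading into the equation and compare with the model
x0 = X_test["P2.L1"].iloc[0]
manual_pred = model.intercept_ + model.coef_[0] * x0
print("By hand:", manual_pred)
print("model.predict:", model.predict(X_test.iloc[[0]])[0])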
# Create a DataFrame df_pred_test that has two columns: "y_test" and "y_pred"
df_pred_test = pd.DataFrame(
    {
        "y_test": y_test, # The first should contain the true values for my test set
        "y_pred": model.predict(X_test), # The second should contain my model's predictions
    }
)
df_pred_test.head()
| timestamp | y_test | y_pred |
|---|---|---|
| 2018-12-07 17:00:00+03:00 | 7.070000 | 8.478927 |
| 2018-12-07 18:00:00+03:00 | 8.968333 | 7.865485 |
| 2018-12-07 19:00:00+03:00 | 11.630833 | 9.076421 |
| 2018-12-07 20:00:00+03:00 | 11.525833 | 10.774814 |
| 2018-12-07 21:00:00+03:00 | 9.533333 | 10.707836 |
# Create a time series line plot for the values in df_pred_test using plotly express
fig = px.line(df_pred_test, labels={"value": "P2"}) # Make sure the y-axis is properly labeled as "P2"
fig.show()