Linear Regression with Time Series Data 🇰🇪
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pytz
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
I've been able to get data from a MongoDB database. Now I can build a model with it.
GOALS
Machine Learning Workflow
I. Prepare Data
II. Build Model
III. Communicate Results
# Create a client to connect to the MongoDB server
client = MongoClient(host="localhost", port=27017) # Connect to the server
db = client["air-quality"] # Name of my database
nairobi = db["nairobi"] # The collection with the Nairobi readings
I've got a connection to my collection, and now I want to build a wrangle function that will get data from that collection and do some cleaning steps. I'm going to build this wrangle function iteratively.
# Create wrangle function so that the results from the database query are read into the DataFrame df
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame
    return df
# Use wrangle function to read the data from the nairobi collection into the DataFrame df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
df.head(10)
| timestamp | P2 |
|---|---|
| 2018-09-01 00:00:02.472 | 34.43 |
| 2018-09-01 00:05:03.941 | 30.53 |
| 2018-09-01 00:10:04.374 | 22.80 |
| 2018-09-01 00:15:04.245 | 13.30 |
| 2018-09-01 00:20:04.869 | 16.57 |
| 2018-09-01 00:25:04.659 | 14.07 |
| 2018-09-01 00:30:05.406 | 16.33 |
| 2018-09-01 00:35:04.259 | 16.50 |
| 2018-09-01 00:40:04.219 | 12.43 |
| 2018-09-01 00:45:04.042 | 11.07 |
df.index[:5]
DatetimeIndex(['2018-09-01 00:00:02.472000', '2018-09-01 00:05:03.941000', '2018-09-01 00:10:04.374000', '2018-09-01 00:15:04.245000', '2018-09-01 00:20:04.869000'], dtype='datetime64[ns]', name='timestamp', freq=None)
df.index.tz_localize("UTC")[:5]
DatetimeIndex(['2018-09-01 00:00:02.472000+00:00', '2018-09-01 00:05:03.941000+00:00', '2018-09-01 00:10:04.374000+00:00', '2018-09-01 00:15:04.245000+00:00', '2018-09-01 00:20:04.869000+00:00'], dtype='datetime64[ns, UTC]', name='timestamp', freq=None)
df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")[:5]
DatetimeIndex(['2018-09-01 03:00:02.472000+03:00', '2018-09-01 03:05:03.941000+03:00', '2018-09-01 03:10:04.374000+03:00', '2018-09-01 03:15:04.245000+03:00', '2018-09-01 03:20:04.869000+03:00'], dtype='datetime64[ns, Africa/Nairobi]', name='timestamp', freq=None)
# Add to wrangle function so that the DatetimeIndex for df is localized to the correct timezone, "Africa/Nairobi"
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(32907, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
| 2018-09-01 03:25:04.659000+03:00 | 14.07 |
| 2018-09-01 03:30:05.406000+03:00 | 16.33 |
| 2018-09-01 03:35:04.259000+03:00 | 16.50 |
| 2018-09-01 03:40:04.219000+03:00 | 12.43 |
| 2018-09-01 03:45:04.042000+03:00 | 11.07 |
# Create a boxplot of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].plot(kind="box", vert=False, title="Distribution of PM2.5 Readings", ax=ax);
There are a lot of outliers.
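To put a rough number on that before choosing a cutoff, here's a minimal sketch (the 500 threshold is my assumption for an implausibly high PM2.5 reading, matching the cutoff I use below):

# Count readings at or above the assumed 500 cutoff
print("Readings at or above 500:", (df["P2"] >= 500).sum())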
# Add to wrangle function so that all "P2" readings above 500 are dropped from the dataset
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(32906, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
| 2018-09-01 03:25:04.659000+03:00 | 14.07 |
| 2018-09-01 03:30:05.406000+03:00 | 16.33 |
| 2018-09-01 03:35:04.259000+03:00 | 16.50 |
| 2018-09-01 03:40:04.219000+03:00 | 12.43 |
| 2018-09-01 03:45:04.042000+03:00 | 11.07 |
# Create a time series plot of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].plot(xlabel="Time", ylabel="PM2.5", title="PM2.5 Time Series", ax=ax);
This time series data bounces all over the place. As we move left to right, we're moving progressively into the future. Right before 2018-10-01 there's a gap where we may be missing some readings, so the line drops.
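To check that hunch about a gap, here's a minimal sketch (not part of the original analysis) that measures the largest time difference between consecutive readings:

# Measure the largest gap between consecutive timestamps
gaps = df.index.to_series().diff()
print("Largest gap between readings:", gaps.max())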
df.head()
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:02.472000+03:00 | 34.43 |
| 2018-09-01 03:05:03.941000+03:00 | 30.53 |
| 2018-09-01 03:10:04.374000+03:00 | 22.80 |
| 2018-09-01 03:15:04.245000+03:00 | 13.30 |
| 2018-09-01 03:20:04.869000+03:00 | 16.57 |
Our data comes in every five minutes or so, but we want hourly readings, so we need to resample.
# Resample
df["P2"].resample("1H") # "P2" column is what we're interested in
<pandas.core.resample.DatetimeIndexResampler object at 0x7fd2743dc1c0>
df["P2"].resample("1H").mean().head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
Whenever we're making time series predictions, the observations need to come at regular intervals, and we can't have missing values.
df["P2"].resample("1H").mean().isnull()
timestamp
2018-09-01 03:00:00+03:00    False
2018-09-01 04:00:00+03:00    False
2018-09-01 05:00:00+03:00    False
2018-09-01 06:00:00+03:00    False
2018-09-01 07:00:00+03:00    False
                             ...
2018-12-31 22:00:00+03:00    False
2018-12-31 23:00:00+03:00    False
2019-01-01 00:00:00+03:00    False
2019-01-01 01:00:00+03:00    False
2019-01-01 02:00:00+03:00    False
Freq: H, Name: P2, Length: 2928, dtype: bool
df["P2"].resample("1H").mean().isnull().sum()
102
We have 102 missing values and we need to impute them.
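The next cells impute with a forward fill. As an aside, here's a minimal sketch of an alternative I'm not using here, linear interpolation between neighboring hours:

# Alternative imputation: interpolate linearly between neighboring hours
interpolated = df["P2"].resample("1H").mean().interpolate(method="linear")
print("Missing values after interpolation:", interpolated.isnull().sum())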
# Fill null values
df["P2"].resample("1H").mean().fillna(method="ffill").head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
df["P2"].resample("1H").mean().fillna(method="ffill").isnull().sum()
0
df["P2"].resample("1H").mean().fillna(method="ffill").to_frame().head()
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 |
| 2018-09-01 04:00:00+03:00 | 15.800000 |
| 2018-09-01 05:00:00+03:00 | 11.420000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 |
| 2018-09-01 07:00:00+03:00 | 17.665000 |
# Add to my wrangle function to resample df to provide the mean "P2" reading for each hour
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    # Resample to 1H window, forward-fill missing values
    df = df["P2"].resample("1H").mean().fillna(method="ffill").to_frame()

    return df
df = wrangle(nairobi) # Pass in the collection object itself, not the string "nairobi"
print(df.shape)
df.head(10)
(2928, 1)
| timestamp | P2 |
|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 |
| 2018-09-01 04:00:00+03:00 | 15.800000 |
| 2018-09-01 05:00:00+03:00 | 11.420000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 |
| 2018-09-01 07:00:00+03:00 | 17.665000 |
| 2018-09-01 08:00:00+03:00 | 21.016667 |
| 2018-09-01 09:00:00+03:00 | 22.589167 |
| 2018-09-01 10:00:00+03:00 | 18.605833 |
| 2018-09-01 11:00:00+03:00 | 14.022500 |
| 2018-09-01 12:00:00+03:00 | 13.150000 |
df["P2"].rolling(168)
Rolling [window=168,center=False,axis=0,method=single]
df["P2"].rolling(168).mean().head()
timestamp
2018-09-01 03:00:00+03:00   NaN
2018-09-01 04:00:00+03:00   NaN
2018-09-01 05:00:00+03:00   NaN
2018-09-01 06:00:00+03:00   NaN
2018-09-01 07:00:00+03:00   NaN
Freq: H, Name: P2, dtype: float64
df["P2"].rolling(168).mean().tail()
timestamp
2018-12-31 22:00:00+03:00    6.932995
2018-12-31 23:00:00+03:00    6.935927
2019-01-01 00:00:00+03:00    6.938348
2019-01-01 01:00:00+03:00    6.973928
2019-01-01 02:00:00+03:00    7.043333
Freq: H, Name: P2, dtype: float64
df["P2"].rolling(168).mean().isnull().sum()
167
That's one less than the window size. It's telling me that the first 167 hours of my dataset have no value, because there isn't a full 168-hour window available yet to calculate the average.
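If those leading NaNs were a problem, pandas can average over partial windows instead. A minimal sketch (not used in this analysis):

# min_periods=1 lets the rolling mean use a partial window at the start
df["P2"].rolling(168, min_periods=1).mean().head()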
# Plot the rolling average of the "P2" readings in df
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].rolling(168).mean().plot(ax=ax, ylabel="PM2.5", title="Weekly Rolling Average"); # Use a window size of 168 (the number of hours in a week)
The big peaks and low valleys are evened out because we averaged over them, so we no longer have extreme values. The line is also a lot smoother than before the rolling average; we've smoothed out the data.
Is there a trend? Not that I'm seeing: no general movement upward or downward, and no regular cyclical movement either. With more data, say several years, we might spot seasonal or long-term trends. In this case, we don't.
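One quick way to sanity-check that impression, as a minimal sketch (not part of the original notebook): compare monthly mean readings and look for steady movement.

# Compare mean PM2.5 levels month by month to look for a trend
df["P2"].resample("1M").mean()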
df["P2"].head()
timestamp
2018-09-01 03:00:00+03:00    17.541667
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
Freq: H, Name: P2, dtype: float64
df["P2"].shift(1).head()
timestamp
2018-09-01 03:00:00+03:00          NaN
2018-09-01 04:00:00+03:00    17.541667
2018-09-01 05:00:00+03:00    15.800000
2018-09-01 06:00:00+03:00    11.420000
2018-09-01 07:00:00+03:00    11.614167
Freq: H, Name: P2, dtype: float64
df["P2.L1"] = df["P2"].shift(1)
df.head()
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 03:00:00+03:00 | 17.541667 | NaN |
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
df["P2.L1"] = df["P2"].shift(1)
df.dropna().head()
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 21.016667 | 17.665000 |
# Add to my wrangle function to create a column called "P2.L1" that contains the mean "P2" reading from the previous hour
def wrangle(collection): # Pass in the collection
    results = collection.find( # Query that collection
        {"metadata.site": 29, "metadata.measurement": "P2"}, # Looking for readings from site 29 that are P2 readings
        projection={"P2": 1, "timestamp": 1, "_id": 0}, # Return P2 readings and the timestamp associated with each reading
    )
    df = pd.DataFrame(results).set_index("timestamp") # Take the results and put them into a DataFrame

    # Localize timezone
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500] # Keep all rows of df where column "P2" is less than 500

    # Resample to 1H window, forward-fill missing values
    df = df["P2"].resample("1H").mean().fillna(method="ffill").to_frame()

    # Add lag feature
    df["P2.L1"] = df["P2"].shift(1)

    # Drop NaN rows
    df.dropna(inplace=True)

    return df
df = wrangle(nairobi)
print(df.shape)
df.head(10)
(2927, 2)
| timestamp | P2 | P2.L1 |
|---|---|---|
| 2018-09-01 04:00:00+03:00 | 15.800000 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 11.420000 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.614167 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 17.665000 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 21.016667 | 17.665000 |
| 2018-09-01 09:00:00+03:00 | 22.589167 | 21.016667 |
| 2018-09-01 10:00:00+03:00 | 18.605833 | 22.589167 |
| 2018-09-01 11:00:00+03:00 | 14.022500 | 18.605833 |
| 2018-09-01 12:00:00+03:00 | 13.150000 | 14.022500 |
| 2018-09-01 13:00:00+03:00 | 12.806667 | 13.150000 |
# Create a correlation matrix for df
df.corr()
| | P2 | P2.L1 |
|---|---|---|
| P2 | 1.000000 | 0.650679 |
| P2.L1 | 0.650679 | 1.000000 |
# Create a scatter plot that shows the mean PM2.5 reading for each hour as a function of the mean reading from the previous hour
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=df["P2.L1"], y=df["P2"])
ax.plot([0, 120], [0, 120], linestyle="--", color="orange")
plt.xlabel("P2.L1") # "P2.L1" should be on the x-axis
plt.ylabel("P2") # "P2" should be on the y-axis
plt.title("PM2.5 Autocorrelation");
target = "P2"
y = df[target]
X = ...
y.head()
timestamp
2018-09-01 04:00:00+03:00    15.800000
2018-09-01 05:00:00+03:00    11.420000
2018-09-01 06:00:00+03:00    11.614167
2018-09-01 07:00:00+03:00    17.665000
2018-09-01 08:00:00+03:00    21.016667
Freq: H, Name: P2, dtype: float64
# Split the DataFrame df into the feature matrix X and the target vector y
target = "P2"
y = df[target]
X = df.drop(columns=target)
X.head()
| timestamp | P2.L1 |
|---|---|
| 2018-09-01 04:00:00+03:00 | 17.541667 |
| 2018-09-01 05:00:00+03:00 | 15.800000 |
| 2018-09-01 06:00:00+03:00 | 11.420000 |
| 2018-09-01 07:00:00+03:00 | 11.614167 |
| 2018-09-01 08:00:00+03:00 | 17.665000 |
len(X) * 0.8
2341.6
int(len(X) * 0.8)
2341
cutoff = int(len(X) * 0.8)
len(X.iloc[:cutoff])
2341
# Split X and y into training and test sets
cutoff = int(len(X) * 0.8)
X_train, y_train = X.iloc[:cutoff], y.iloc[:cutoff] # The first 80% of the data should be in my training set
X_test, y_test = X.iloc[cutoff:], y.iloc[cutoff:] # The remaining 20% should be in the test set
len(X_train)
2341
len(X_train) + len(X_test)
2927
len(X_train) + len(X_test) == len(X)
True
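Because this is time series data, the split has to preserve chronological order; shuffling would leak future information into training. For reference, here's a minimal sketch of essentially the same split using scikit-learn (not what this notebook does; the key detail is shuffle=False):

from sklearn.model_selection import train_test_split

# shuffle=False keeps rows in time order; the test set is the most recent ~20%
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, shuffle=False)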
# Calculate the baseline mean absolute error for my model
y_pred_baseline = [y_train.mean()] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train.mean(), 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 9.27
Baseline MAE: 3.89
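As a reminder of what MAE measures, here's a minimal sketch that computes the same baseline by hand: the mean of the absolute differences between each true value and the constant prediction.

import numpy as np

# Baseline MAE by hand: average absolute deviation from the training mean
mae_by_hand = np.mean(np.abs(y_train - y_train.mean()))
print("Baseline MAE (by hand):", round(mae_by_hand, 2))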
# Instantiate a LinearRegression model named "model"
model = LinearRegression()
model.fit(X_train, y_train) # Fit it to my training data
LinearRegression()
# Calculate the training and test mean absolute error for my model
training_mae = mean_absolute_error(y_train, model.predict(X_train))
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Training MAE:", round(training_mae, 2))
print("Test MAE:", round(test_mae, 2))
Training MAE: 2.46
Test MAE: 1.8
# Extract the intercept and coefficient from my model
intercept = model.intercept_.round(2)
coefficient = model.coef_.round(2)[0]
print(f"P2 = {intercept} + ({coefficient} * P2.L1)")
P2 = 3.36 + (0.64 * P2.L1)
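To confirm the fitted model really is just this equation, here's a minimal sketch that plugs the first test observation into it by hand and compares against model.predict:

# Plug the first lagged reading into the equation and compare with the model
x0 = X_test["P2.L1"].iloc[0]
manual_pred = model.intercept_ + model.coef_[0] * x0
print("By hand:", manual_pred)
print("model.predict:", model.predict(X_test.iloc[[0]])[0])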
# Create a DataFrame df_pred_test that has two columns: "y_test" and "y_pred"
df_pred_test = pd.DataFrame(
    {
        "y_test": y_test, # The first should contain the true values for my test set
        "y_pred": model.predict(X_test), # The second should contain my model's predictions
    }
)
df_pred_test.head()
| timestamp | y_test | y_pred |
|---|---|---|
| 2018-12-07 17:00:00+03:00 | 7.070000 | 8.478927 |
| 2018-12-07 18:00:00+03:00 | 8.968333 | 7.865485 |
| 2018-12-07 19:00:00+03:00 | 11.630833 | 9.076421 |
| 2018-12-07 20:00:00+03:00 | 11.525833 | 10.774814 |
| 2018-12-07 21:00:00+03:00 | 9.533333 | 10.707836 |
# Create a time series line plot for the values in df_pred_test using plotly express
fig = px.line(df_pred_test, labels={"value": "P2"}) # Make sure the y-axis is properly labeled as "P2"
fig.show()