Housing Price Predictor for Buenos Aires 🇦🇷
Part 2: Predicting Housing Price with Location
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
This project builds on my Predicting Housing Price with Size project. Here, I create a more complex wrangle function, use it to clean more data, and build a model that considers more features when predicting apartment price.
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=FutureWarning)
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    return df
Import CSV 1
# Use wrangle function to create a DataFrame frame1 from the CSV file "data/buenos-aires-real-estate-1.csv"
frame1 = wrangle("data/buenos-aires-real-estate-1.csv")
print(frame1.info())
frame1.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   1343 non-null   object
 1   property_type               1343 non-null   object
 2   place_with_parent_names     1343 non-null   object
 3   lat-lon                     1300 non-null   object
 4   price                       1343 non-null   float64
 5   currency                    1343 non-null   object
 6   price_aprox_local_currency  1343 non-null   float64
 7   price_aprox_usd             1343 non-null   float64
 8   surface_total_in_m2         965 non-null    float64
 9   surface_covered_in_m2       1343 non-null   float64
 10  price_usd_per_m2            927 non-null    float64
 11  price_per_m2                1343 non-null   float64
 12  floor                       379 non-null    float64
 13  rooms                       1078 non-null   float64
 14  expenses                    349 non-null    object
 15  properati_url               1343 non-null   object
dtypes: float64(9), object(7)
memory usage: 178.4+ KB
None
 | operation | property_type | place_with_parent_names | lat-lon | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | -34.5846508988,-58.4546932614 | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | -34.6389789,-58.500115 | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | -34.615847,-58.459957 | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | -34.6252219,-58.3823825 | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... |
41 | sell | apartment | |Argentina|Capital Federal|Once| | -34.6106102,-58.4125107 | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... |
For my model, I'm going to consider apartment location, specifically latitude and longitude. Looking at the output from frame1.info(), I can see that the location information is in a single column whose data type is object (the pandas term for str in this case). In order to build my model, I need latitude and longitude to each be in their own column with the data type float.
Split "Lat-Lon" Column
Here, I'll edit my wrangle function so that, in the DataFrame it returns, the "lat-lon" column is replaced by separate "lat" and "lon" columns.
frame1["lat-lon"].head()
4     -34.5846508988,-58.4546932614
9            -34.6389789,-58.500115
29            -34.615847,-58.459957
40          -34.6252219,-58.3823825
41          -34.6106102,-58.4125107
Name: lat-lon, dtype: object
frame1["lat-lon"].str.split(",").head()
4     [-34.5846508988, -58.4546932614]
9            [-34.6389789, -58.500115]
29            [-34.615847, -58.459957]
40          [-34.6252219, -58.3823825]
41          [-34.6106102, -58.4125107]
Name: lat-lon, dtype: object
frame1["lat-lon"].str.split(",", expand=True).head()
 | 0 | 1 |
---|---|---|
4 | -34.5846508988 | -58.4546932614 |
9 | -34.6389789 | -58.500115 |
29 | -34.615847 | -58.459957 |
40 | -34.6252219 | -58.3823825 |
41 | -34.6106102 | -58.4125107 |
# Recast my data
frame1["lat-lon"].str.split(",", expand=True).astype(float)
 | 0 | 1 |
---|---|---|
4 | -34.584651 | -58.454693 |
9 | -34.638979 | -58.500115 |
29 | -34.615847 | -58.459957 |
40 | -34.625222 | -58.382382 |
41 | -34.610610 | -58.412511 |
... | ... | ... |
8589 | -34.631591 | -58.370191 |
8590 | -34.604555 | -58.418206 |
8593 | -34.624002 | -58.390588 |
8601 | -34.601455 | -58.378132 |
8604 | NaN | NaN |
1343 rows × 2 columns
frame1["lat-lon"].str.split(",", expand=True).astype(float).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       1300 non-null   float64
 1   1       1300 non-null   float64
dtypes: float64(2)
memory usage: 31.5 KB
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" into separate float columns "lat" and "lon"
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)  # Drop the original column in place rather than creating a new DataFrame

    return df
Summary
- Added the "lat-lon" split to my wrangle function
- Modified df inside the scope of my function (using inplace=True)
- Dropped the original "lat-lon" column
# Check that "lat-lon" is dropped and the new "lat" and "lon" columns are created
frame1 = wrangle("data/buenos-aires-real-estate-1.csv")
print(frame1.info())
frame1.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   1343 non-null   object
 1   property_type               1343 non-null   object
 2   place_with_parent_names     1343 non-null   object
 3   price                       1343 non-null   float64
 4   currency                    1343 non-null   object
 5   price_aprox_local_currency  1343 non-null   float64
 6   price_aprox_usd             1343 non-null   float64
 7   surface_total_in_m2         965 non-null    float64
 8   surface_covered_in_m2       1343 non-null   float64
 9   price_usd_per_m2            927 non-null    float64
 10  price_per_m2                1343 non-null   float64
 11  floor                       379 non-null    float64
 12  rooms                       1078 non-null   float64
 13  expenses                    349 non-null    object
 14  properati_url               1343 non-null   object
 15  lat                         1300 non-null   float64
 16  lon                         1300 non-null   float64
dtypes: float64(11), object(6)
memory usage: 188.9+ KB
None
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
41 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
Now that my wrangle function is working, I can use it to clean more data.
Import CSV 2
# Use the revised wrangle function to create a DataFrame frame2 from the file "data/buenos-aires-real-estate-2.csv"
frame2 = wrangle("data/buenos-aires-real-estate-2.csv")
Using a function is much quicker than cleaning each file individually. I can now combine my DataFrames so I can use them to train my model.
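As an aside, if more listing CSVs followed the same naming pattern, the whole cleaning-and-combining step could be done in a loop. A minimal sketch (the glob pattern below is an assumption about the file names, not files guaranteed to exist in this project):
# Sketch: wrangle every matching CSV and stack the results (file pattern is assumed)
from glob import glob
files = sorted(glob("data/buenos-aires-real-estate-*.csv"))
frames = [wrangle(file) for file in files]
df_all = pd.concat(frames, ignore_index=True)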
frame1.head()
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
41 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
frame2.head()
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | sell | apartment | |Argentina|Capital Federal|Recoleta| | 215000.0 | USD | 3259916.00 | 215000.00 | 40.0 | 35.0 | 5375.000000 | 6142.857143 | NaN | 1.0 | 3500.0 | http://recoleta.properati.com.ar/12j4v_venta_d... | -34.588993 | -58.400133 |
9 | sell | apartment | |Argentina|Capital Federal|Recoleta| | 341550.0 | USD | 5178717.72 | 341550.00 | NaN | 90.0 | NaN | 3795.000000 | 8.0 | 2.0 | NaN | http://recoleta.properati.com.ar/100t0_venta_d... | -34.588044 | -58.398066 |
12 | sell | apartment | |Argentina|Capital Federal|Monserrat| | 1386000.0 | ARS | 1382153.13 | 91156.62 | 39.0 | 33.0 | 2337.349231 | 42000.000000 | NaN | NaN | NaN | http://monserrat.properati.com.ar/t05l_venta_d... | -34.623320 | -58.397461 |
13 | sell | apartment | |Argentina|Capital Federal|Belgrano| | 105000.0 | USD | 1592052.00 | 105000.00 | NaN | 33.0 | NaN | 3181.818182 | 1.0 | 1.0 | NaN | http://belgrano.properati.com.ar/zsd5_venta_de... | -34.553897 | -58.451939 |
17 | sell | apartment | |Argentina|Capital Federal|Villa del Parque| | 89681.0 | USD | 1359779.19 | 89681.00 | 46.0 | 39.0 | 1949.586957 | 2299.512821 | NaN | 1.0 | 1500.0 | http://villa-del-parque.properati.com.ar/12q2f... | -34.628813 | -58.472230 |
# Use pd.concat to concatenate frame1 and frame2 into a new DataFrame df
df = pd.concat([frame1, frame2], ignore_index=True) # Set the ignore_index argument to True
print(df.info())
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   2658 non-null   object
 1   property_type               2658 non-null   object
 2   place_with_parent_names     2658 non-null   object
 3   price                       2658 non-null   float64
 4   currency                    2658 non-null   object
 5   price_aprox_local_currency  2658 non-null   float64
 6   price_aprox_usd             2658 non-null   float64
 7   surface_total_in_m2         1898 non-null   float64
 8   surface_covered_in_m2       2658 non-null   float64
 9   price_usd_per_m2            1818 non-null   float64
 10  price_per_m2                2658 non-null   float64
 11  floor                       769 non-null    float64
 12  rooms                       2137 non-null   float64
 13  expenses                    688 non-null    object
 14  properati_url               2658 non-null   object
 15  lat                         2561 non-null   float64
 16  lon                         2561 non-null   float64
dtypes: float64(11), object(6)
memory usage: 353.1+ KB
None
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
1 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
2 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
3 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
4 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
In the Predicting Housing Price with Size project, I built a simple linear model that predicted apartment price based on one feature, surface_covered_in_m2. In this project, I'm building a multiple linear regression model that predicts price based on two features, lon and lat. This means that my data visualizations now have to communicate three pieces of information: longitude, latitude, and price.
I can represent these three attributes on a two-dimensional screen. One option is to incorporate color into my scatter plot. For example, in the Mapbox scatter plot below, the location of each point represents latitude and longitude, and color represents price.
Scatter Mapbox Plot
# Create a Mapbox scatter plot that shows the location of the apartments in df
fig = px.scatter_mapbox(
df, # My DataFrame
lat="lat",
lon="lon",
width=600, # Width of map
height=600, # Height of map
color="price_aprox_usd",
hover_data=["price_aprox_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
This visualization shows just how important location is in determining property price: as you move closer to the coastline, property prices tend to go up.
Another option is to add a third dimension to my scatter plot. I can plot longitude on the x-axis and latitude on the y-axis (like I did in the map above), and then add a z-axis with price.
3D Scatter Plot
# Create 3D scatter plot
fig = px.scatter_3d(
df,
x="lon", # Label "lon" on the x-axis
y="lat", # Label "lat" on the y-axis
z="price_aprox_usd", # Label "price_aprox_usd" on the z-axis
labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
width=600,
height=500,
)
# Refine formatting
fig.update_traces(
marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
selector={"mode": "markers"},
)
# Display figure
fig.show()
Even though I'm building a different model, the steps I follow will be the same. I will now separate my features (latitude and longitude) from my target (price).
Split Data: Feature Matrix
# Create the feature matrix named X_train
features = ["lon", "lat"] # It should contain two features: ["lon", "lat"]
X_train = df[features]
X_train.head()
 | lon | lat |
---|---|---|
0 | -58.454693 | -34.584651 |
1 | -58.500115 | -34.638979 |
2 | -58.459957 | -34.615847 |
3 | -58.382382 | -34.625222 |
4 | -58.412511 | -34.610610 |
X_train.shape
(2658, 2)
Split Data: Target Vector
# Create the target vector named y_train, which will be used to train my model
target = "price_aprox_usd"
y_train = df[target]
y_train.head()
0    129000.0
1     87000.0
2    118000.0
3     57000.0
4     90000.0
Name: price_aprox_usd, dtype: float64
# Target vector should be one-dimensional (most cases)
y_train.shape
(2658,)
I need to set a baseline so I can evaluate my model's performance. Since I've added more observations to my training data than in the Predicting Housing Price with Size project, the value of y_mean won't be exactly the same.
Baseline: Mean
# Calculate the mean of my target vector y_train and assign it to the variable y_mean
y_mean = y_train.mean()
# Create a list named y_pred_baseline containing the value of y_mean repeated so it's the same length as y_train
y_pred_baseline = [y_mean] * len(y_train)
y_pred_baseline[:5]
[134732.97340481562, 134732.97340481562, 134732.97340481562, 134732.97340481562, 134732.97340481562]
Baseline: Mean Absolute Error
# Calculate baseline mean absolute error for predictions in y_pred_baseline
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)  # As compared to the true targets in y_train
print("Mean apt price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean apt price 134732.97
Baseline MAE: 45422.75
If I were to predict that every apartment costs about $135,000, my predictions would be off by about $45,000 on average.
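As a quick sanity check, this baseline MAE is just the mean absolute deviation of the target around its mean, so it can be reproduced directly from y_train:
# Sanity check: the baseline MAE equals the mean absolute deviation of y_train around y_mean
mad = (y_train - y_mean).abs().mean()
print("Mean absolute deviation:", round(mad, 2))  # should match mae_baseline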
Looking at the output of df.info() and the values in the "Non-Null Count" column, I can see that the "lat" and "lon" columns contain missing values, and a linear regression model can't handle observations with missing values.
Dropping rows that contain NaN values isn't ideal in this case, because models generally perform better when they have more data to train with. Instead, I can fill in these missing values using information from the whole column via imputation. There are many different strategies for imputing missing values, and one of the most common is filling in the missing values with the mean of the column.
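For reference, SimpleImputer supports several strategies besides the mean; a short illustrative sketch (none of these alternatives are used in this project):
# Illustrative only: other imputation strategies available in SimpleImputer
imputer_mean = SimpleImputer(strategy="mean")                      # column mean (the default)
imputer_median = SimpleImputer(strategy="median")                  # column median, more robust to outliers
imputer_mode = SimpleImputer(strategy="most_frequent")             # most common value
imputer_const = SimpleImputer(strategy="constant", fill_value=0)   # a fixed fill value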
In addition to predictors like LinearRegression, scikit-learn also has transformers that help deal with issues like missing values. I'll add one to my model in the next step.
Instantiate Transformer
# This won't run as-is: LinearRegression can't fit data that contains NaN values
model = LinearRegression()
model.fit(X_train, y_train)
X_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lon     2561 non-null   float64
 1   lat     2561 non-null   float64
dtypes: float64(2)
memory usage: 41.7 KB
# Instantiate a SimpleImputer (transformer) named imputer
imputer = SimpleImputer()
Just like a predictor, a transformer has a fit method. In the case of my SimpleImputer, this is the step where it calculates the mean values for each numerical column.
Train Transformer
# Fit the transformer imputer to the feature matrix X_train
imputer.fit(X_train)
SimpleImputer()
# Check work
check_is_fitted(imputer)
Here's where transformers diverge from predictors. Instead of using a method like predict, I use the transform method. This is the step where the transformer fills in the missing values with the means it's calculated.
Transform Data
# Use imputer to transform the feature matrix X_train and assign the transformed data to the variable XT_train
XT_train = imputer.transform(X_train)
pd.DataFrame(XT_train, columns=X_train.columns).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lon     2658 non-null   float64
 1   lat     2658 non-null   float64
dtypes: float64(2)
memory usage: 41.7 KB
My data is free of missing values. But since a model may require multiple transformers, and doing all those transformations one-by-one is slow and likely to lead to errors, I can combine my transformer and predictor into a single object called a pipeline.
Build Pipeline
# Create a pipeline named model that contains a SimpleImputer transformer
model = make_pipeline(
    SimpleImputer(),
    LinearRegression()  # Followed by a LinearRegression predictor
)
I can have as many transformers as I want in a pipeline (one after another) but I can only have one predictor and it has to come at the end of the pipeline.
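For instance, a pipeline could chain several transformers ahead of the single, final predictor. A minimal sketch (the StandardScaler step isn't needed for this model; it's only there to illustrate the ordering):
# Illustrative only: two transformers followed by one predictor at the end
from sklearn.preprocessing import StandardScaler
example_pipeline = make_pipeline(
    SimpleImputer(),     # transformer 1: fill missing values
    StandardScaler(),    # transformer 2: standardize features
    LinearRegression()   # predictor: must come last
)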
With my pipeline assembled, I'll use the fit method, which will train the transformer, transform the data, then pass the transformed data to the predictor for training, all in one step. This is a much easier approach.
Train Model
# Fit my model to the data, X_train and y_train
model.fit(X_train, y_train)
Pipeline(steps=[('simpleimputer', SimpleImputer()), ('linearregression', LinearRegression())])
# Check my work
check_is_fitted(model["linearregression"])
Now I can see how my trained model performs.
I'll start by evaluating my model's performance on the training data.
Generate Predictions
# Create a list of predictions for the observations in my feature matrix X_train, named y_pred_training
y_pred_training = model.predict(X_train)
Training Mean Absolute Error
# Calculate the training mean absolute error for predictions in y_pred_training as compared to the true targets in y_train
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))
Training MAE: 42962.72
It looks like my model performs a little better than the baseline. This suggests that latitude and longitude aren't as strong predictors of price as size is.
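To put "a little better" in numbers, the training MAE is roughly a 5% reduction relative to the baseline; a quick comparison using the variables above:
# Quick comparison of baseline vs. training error
improvement = mae_baseline - mae_training
print("MAE improvement over baseline:", round(improvement, 2))
print("Relative improvement:", round(100 * improvement / mae_baseline, 1), "%")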
Now I can check my test performance. Keep in mind that once I evaluate my model on the test set, I shouldn't go back and iterate on it any further.
# Import test data "data/buenos-aires-test-features.csv" into a DataFrame
X_test = pd.read_csv("data/buenos-aires-test-features.csv")[features]
y_pred_test = pd.Series(model.predict(X_test)) # Generate a Series of predictions using my model
y_pred_test.head()
0    136372.324695
1    168620.352353
2    130231.628267
3    102497.549527
4    123482.077850
dtype: float64
I want my test performance to be about the same as my training performance.
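The test-features file here doesn't include the true prices, so I can only generate predictions. If the true test prices were available, say in a hypothetical "data/buenos-aires-test-labels.csv" with a "price_aprox_usd" column (an assumed file, not part of this project), the check would look like this:
# Hypothetical check, assuming a labels file exists with the true test prices
y_test = pd.read_csv("data/buenos-aires-test-labels.csv")["price_aprox_usd"]  # assumed file and column
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print("Test MAE:", round(mae_test, 2))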
I now look at the equation my model has come up with for predicting price based on latitude and longitude. I'll need to expand on my formula to account for both features.
Extract Intercept and Coefficients
# Extract the intercept for my model
intercept = model.named_steps["linearregression"].intercept_.round() # Round to whole number
intercept
38113587.0
# Extract the coefficients for my model
coefficients = model.named_steps["linearregression"].coef_.round()
coefficients
array([196709., 765467.])
print(f"apt_price = {intercept} + {coefficients} * surface_covered")
apt_price = 38113587.0 + [196709. 765467.] * surface_covered
The equation above tells me that, since both coefficients are positive, the predicted apartment price increases as you move north and east, which is consistent with the map: prices rise toward the coastline.
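As a check on the equation, plugging one apartment's coordinates into it should roughly reproduce the pipeline's prediction for that row (the small difference comes from rounding the intercept and coefficients):
# Compare the written-out equation against the pipeline for the first training observation
lon0, lat0 = X_train.iloc[0][["lon", "lat"]]
price_from_equation = intercept + coefficients[0] * lon0 + coefficients[1] * lat0
price_from_pipeline = model.predict(X_train.iloc[[0]])[0]
print("Equation prediction:", round(price_from_equation, 2))
print("Pipeline prediction:", round(price_from_pipeline, 2))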
Model 3D Scatter Plot
# Create a 3D scatter plot
fig = px.scatter_3d(
df,
x="lon", # "lon" on the x-axis
y="lat", # "lat" on the y-axis
z="price_aprox_usd", # "price_aprox_usd" on the z-axis
labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
width=600,
height=500,
)
# Create x and y coordinates for model representation
x_plane = np.linspace(df["lon"].min(), df["lon"].max(), 10)
y_plane = np.linspace(df["lat"].min(), df["lat"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)
# Use model to predict z coordinates over the full grid so the surface shows the fitted plane
zz = model.predict(
    pd.DataFrame({"lon": xx.ravel(), "lat": yy.ravel()})
).reshape(xx.shape)
# Add plane to figure
fig.add_trace(go.Surface(x=xx, y=yy, z=zz))
# Refine formatting
fig.update_traces(
marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
selector={"mode": "markers"},
)
# Display figure
fig.show()