Predicting Apartment Prices in Mexico City 🇲🇽
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import warnings
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from glob import glob
from category_encoders import OneHotEncoder
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=FutureWarning)
I'll first write a wrangle
function that takes the name of a CSV file as input and returns a DataFrame. My function will do the following steps:
"Distrito Federal"
) that cost less than $100,000."surface_covered_in_m2"
."lat"
and "lon"
columns."borough"
feature from the "place_with_parent_names"
column."price_aprox_usd"
.# Build my "wrangle" function
def wrangle(filepath):
# Read CSV file
df = pd.read_csv(filepath)
# Subset data: Apartments in "Distrito Federal", less than 100,000
mask_D = df["place_with_parent_names"].str.contains("Distrito Federal")
df = df[mask_D]
# Subset type
mask_apt = df["property_type"] == "apartment"
df = df[mask_apt]
# Subset price
mask_price = df["price_aprox_usd"] < 100_000
df = df[mask_price]
# Subset data: Remove outliers for "surface_covered_in_m2"
low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
mask_area = df["surface_covered_in_m2"].between(low, high)
df = df[mask_area]
# Split "lat-lon" column
df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
df.drop(columns="lat-lon", inplace=True)
# Get borough name
df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]
df.drop(columns="place_with_parent_names", inplace=True)
# Drop over 50% Null Values
df.drop(columns = ["floor", "expenses", "surface_total_in_m2", "price_usd_per_m2", "rooms"], inplace = True)
# Drop low- and high- cardinality categorical variables
df.drop(columns = ["operation", "property_type", "currency", "properati_url"], inplace = True)
# Remove leakage
df.drop(columns = ['price', 'price_aprox_local_currency', 'price_per_m2'], inplace = True)
return df
# Use glob to create the list files. It contains the filenames of all the Mexico City real estate CSVs except for data/mexico-city-test-features.csv
files = glob("data/mexico-city-real-estate-*.csv")
files
['data/mexico-city-real-estate-5.csv', 'data/mexico-city-real-estate-4.csv', 'data/mexico-city-real-estate-1.csv', 'data/mexico-city-real-estate-3.csv', 'data/mexico-city-real-estate-2.csv']
# Combine my wrangle function, a list comprehension, and pd.concat to create a DataFrame df. It contains all the properties from the five CSVs in files
frames = [wrangle(file) for file in files]
df = pd.concat(frames, ignore_index = True)
print(df.info())
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5473 entries, 0 to 5472 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price_aprox_usd 5473 non-null float64 1 surface_covered_in_m2 5473 non-null float64 2 lat 5149 non-null float64 3 lon 5149 non-null float64 4 borough 5473 non-null object dtypes: float64(4), object(1) memory usage: 213.9+ KB None
price_aprox_usd | surface_covered_in_m2 | lat | lon | borough | |
---|---|---|---|---|---|
0 | 82737.39 | 75.0 | 19.362690 | -99.150565 | Benito Juárez |
1 | 72197.60 | 62.0 | 19.291345 | -99.124312 | Tlalpan |
2 | 44277.72 | 85.0 | 19.354987 | -99.061709 | Iztapalapa |
3 | 60589.45 | 52.0 | 19.469681 | -99.086136 | Gustavo A. Madero |
4 | 47429.08 | 53.0 | 19.443592 | -99.121407 | Venustiano Carranza |
First, I'll create a histogram showing the distribution of apartment prices "price_aprox_usd"
in df
.
# Plot distribution of price
plt.hist(df["price_aprox_usd"])
plt.xlabel("Area [sq meters]") # Label x-axis "Area [sq meters]"
plt.ylabel("Count") # Label y-axis "Count"
plt.title("Distribution of Apartment Prices"); # Title "Distribution of Apartment Prices"
Now, I'll create a scatter plot that shows apartment price "price_aprox_usd"
as a function of apartment size ( "surface_covered_in_m2"
).
# Plot price vs area
plt.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")
plt.title("Mexico City: Price vs. Area");
Finally, I'll create a Mapbox scatter plot that shows the location of the apartments in my dataset and represent their price using color.
# Plot Mapbox location and price
fig = px.scatter_mapbox(
df, # My DataFrame
lat="lat",
lon="lon",
width=600, # Width of map
height=600, # Height of map
color="price_aprox_usd",
hover_data=["price_aprox_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
First, I need to create my feature matrix X_train and target vector y_train. And my target is "price_aprox_usd"
.
# Split data into feature matrix "X_train"
target = "price_aprox_usd"
X_train = df.drop(columns=target)
X_train.head()
surface_covered_in_m2 | lat | lon | borough | |
---|---|---|---|---|
0 | 75.0 | 19.362690 | -99.150565 | Benito Juárez |
1 | 62.0 | 19.291345 | -99.124312 | Tlalpan |
2 | 85.0 | 19.354987 | -99.061709 | Iztapalapa |
3 | 52.0 | 19.469681 | -99.086136 | Gustavo A. Madero |
4 | 53.0 | 19.443592 | -99.121407 | Venustiano Carranza |
# Split data into target vector "y_train"
target = "price_aprox_usd"
y_train = df[target]
y_train.head()
0 82737.39 1 72197.60 2 44277.72 3 60589.45 4 47429.08 Name: price_aprox_usd, dtype: float64
# Calculate the baseline mean absolute error for my model
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)
Mean apt price: 54246.5314982642 Baseline MAE: 17239.939475888295
I'll create a pipeline named model that contains all the transformers necessary for this dataset and one of the predictors I've used during this project. Then fit my model to the training data
# Build Model
model = make_pipeline(
OneHotEncoder(use_cat_names=True),
SimpleImputer(),
Ridge()
)
# Fit model
model.fit(X_train, y_train)
Pipeline(steps=[('onehotencoder', OneHotEncoder(cols=['borough'], use_cat_names=True)), ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('onehotencoder', OneHotEncoder(cols=['borough'], use_cat_names=True)), ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])
OneHotEncoder(cols=['borough'], use_cat_names=True)
SimpleImputer()
Ridge()
# Read the CSV file "data/mexico-city-test-features.csv" into the DataFrame X_test
X_test = pd.read_csv("data/mexico-city-test-features.csv")
print(X_test.info())
X_test.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1041 entries, 0 to 1040 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 surface_covered_in_m2 1041 non-null float64 1 lat 986 non-null float64 2 lon 986 non-null float64 3 borough 1041 non-null object dtypes: float64(3), object(1) memory usage: 32.7+ KB None
surface_covered_in_m2 | lat | lon | borough | |
---|---|---|---|---|
0 | 60.0 | 19.493185 | -99.205755 | Azcapotzalco |
1 | 55.0 | 19.307247 | -99.166700 | Coyoacán |
2 | 50.0 | 19.363469 | -99.010141 | Iztapalapa |
3 | 60.0 | 19.474655 | -99.189277 | Azcapotzalco |
4 | 74.0 | 19.394628 | -99.143842 | Benito Juárez |
# Use my model to generate a Series of predictions for X_test
y_test_pred = pd.Series(model.predict(X_test))
y_test_pred.head()
0 53538.366480 1 53171.988369 2 34263.884179 3 53488.425607 4 68738.924884 dtype: float64
I'll create a Series named "feat_imp"
. And the index contains the names of all the features my model considers when making predictions
model = make_pipeline(
OneHotEncoder(use_cat_names=True),
SimpleImputer(),
Ridge()
)
model.fit(X_train, y_train)
Pipeline(steps=[('onehotencoder', OneHotEncoder(cols=['borough'], use_cat_names=True)), ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('onehotencoder', OneHotEncoder(cols=['borough'], use_cat_names=True)), ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])
OneHotEncoder(cols=['borough'], use_cat_names=True)
SimpleImputer()
Ridge()
coefficients = model.named_steps['ridge'].coef_
features = model.named_steps["onehotencoder"].get_feature_names()
feat_imp = pd.Series(coefficients, index=features).sort_values(ascending=True, key=abs)
feat_imp
surface_covered_in_m2 291.654156 borough_Cuauhtémoc -350.531990 borough_Iztacalco 405.403127 lat 478.901375 borough_Xochimilco 929.857400 borough_Miguel Hidalgo 1977.314718 borough_Azcapotzalco 2459.288646 lon -2492.221814 borough_Álvaro Obregón 3275.121061 borough_Coyoacán 3737.561001 borough_Venustiano Carranza -5609.918629 borough_La Magdalena Contreras -5925.666450 borough_Gustavo A. Madero -6637.429757 borough_Cuajimalpa de Morelos 9157.269123 borough_Tlalpan 10319.429804 borough_Iztapalapa -13349.017448 borough_Benito Juárez 13778.188880 borough_Tláhuac -14166.869486 dtype: float64
feat_imp.sort_values(key=abs)
surface_covered_in_m2 291.654156 borough_Cuauhtémoc -350.531990 borough_Iztacalco 405.403127 lat 478.901375 borough_Xochimilco 929.857400 borough_Miguel Hidalgo 1977.314718 borough_Azcapotzalco 2459.288646 lon -2492.221814 borough_Álvaro Obregón 3275.121061 borough_Coyoacán 3737.561001 borough_Venustiano Carranza -5609.918629 borough_La Magdalena Contreras -5925.666450 borough_Gustavo A. Madero -6637.429757 borough_Cuajimalpa de Morelos 9157.269123 borough_Tlalpan 10319.429804 borough_Iztapalapa -13349.017448 borough_Benito Juárez 13778.188880 borough_Tláhuac -14166.869486 dtype: float64
# Create a horizontal bar chart that shows the 10 most influential coefficients for my model
feat_imp.sort_values(key=abs).tail(15).plot(kind="barh")
plt.xlabel("Importance [USD]") # Label x-axis as "Importance [USD]"
plt.ylabel("Feature") # Label y-axis as "Feature"
plt.title("Feature Importances for Apartment Price"); # Title chart as "Feature Importances for Apartment Price"