__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import warnings
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from glob import glob
from category_encoders import OneHotEncoder
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)


# Build my "wrangle" function

def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Distrito Federal", less than 100,000
    mask_D = df["place_with_parent_names"].str.contains("Distrito Federal")
    df = df[mask_D]
    
    # Subset type
    mask_apt = df["property_type"] == "apartment"
    df = df[mask_apt]
    
    # Subset price
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_price]
    
    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)
    
    # Get borough name
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]
    df.drop(columns="place_with_parent_names", inplace=True)
    
    # Drop over 50% Null Values
    df.drop(columns = ["floor", "expenses", "surface_total_in_m2", "price_usd_per_m2", "rooms"], inplace = True)
    
    # Drop low- and high- cardinality categorical variables
    df.drop(columns = ["operation", "property_type", "currency", "properati_url"], inplace = True)
    
    # Remove leakage
    df.drop(columns = ['price', 'price_aprox_local_currency', 'price_per_m2'], inplace = True)
    
    return df


# Use glob to create the list files. It contains the filenames of all the Mexico City real estate CSVs except for data/mexico-city-test-features.csv
files = glob("data/mexico-city-real-estate-*.csv")
files

['data/mexico-city-real-estate-5.csv',
 'data/mexico-city-real-estate-4.csv',
 'data/mexico-city-real-estate-1.csv',
 'data/mexico-city-real-estate-3.csv',
 'data/mexico-city-real-estate-2.csv']


# Combine my wrangle function, a list comprehension, and pd.concat to create a DataFrame df. It contains all the properties from the five CSVs in files
frames = [wrangle(file) for file in files]
df = pd.concat(frames, ignore_index = True)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price_aprox_usd        5473 non-null   float64
 1   surface_covered_in_m2  5473 non-null   float64
 2   lat                    5149 non-null   float64
 3   lon                    5149 non-null   float64
 4   borough                5473 non-null   object 
dtypes: float64(4), object(1)
memory usage: 213.9+ KB
None


# Plot distribution of price
plt.hist(df["price_aprox_usd"])
plt.xlabel("Area [sq meters]") # Label x-axis "Area [sq meters]"
plt.ylabel("Count") # Label y-axis "Count"
plt.title("Distribution of Apartment Prices"); # Title "Distribution of Apartment Prices"


# Plot price vs area
plt.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")
plt.title("Mexico City: Price vs. Area");


# Plot Mapbox location and price
fig = px.scatter_mapbox(
    df,  # My DataFrame
    lat="lat",
    lon="lon",
    width=600,  # Width of map
    height=600,  # Height of map
    color="price_aprox_usd",
    hover_data=["price_aprox_usd"],  # Display price when hovering mouse over house
)

fig.update_layout(mapbox_style="open-street-map")

fig.show()


# Split data into feature matrix "X_train"
target = "price_aprox_usd"
X_train = df.drop(columns=target)
X_train.head()


# Split data into target vector "y_train"
target = "price_aprox_usd"
y_train = df[target]
y_train.head()

0    82737.39
1    72197.60
2    44277.72
3    60589.45
4    47429.08
Name: price_aprox_usd, dtype: float64


# Calculate the baseline mean absolute error for my model
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

Mean apt price: 54246.5314982642
Baseline MAE: 17239.939475888295


# Build Model
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)
# Fit model
model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['borough'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['borough'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

OneHotEncoder(cols=['borough'], use_cat_names=True)

SimpleImputer()

Ridge()


# Read the CSV file "data/mexico-city-test-features.csv" into the DataFrame X_test
X_test = pd.read_csv("data/mexico-city-test-features.csv")
print(X_test.info())
X_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041 entries, 0 to 1040
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surface_covered_in_m2  1041 non-null   float64
 1   lat                    986 non-null    float64
 2   lon                    986 non-null    float64
 3   borough                1041 non-null   object 
dtypes: float64(3), object(1)
memory usage: 32.7+ KB
None


# Use my model to generate a Series of predictions for X_test
y_test_pred = pd.Series(model.predict(X_test))
y_test_pred.head()

0    53538.366480
1    53171.988369
2    34263.884179
3    53488.425607
4    68738.924884
dtype: float64


model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)

model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['borough'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['borough'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

OneHotEncoder(cols=['borough'], use_cat_names=True)

SimpleImputer()

Ridge()


coefficients = model.named_steps['ridge'].coef_
features = model.named_steps["onehotencoder"].get_feature_names()
feat_imp = pd.Series(coefficients, index=features).sort_values(ascending=True, key=abs)
feat_imp

surface_covered_in_m2               291.654156
borough_Cuauhtémoc                 -350.531990
borough_Iztacalco                   405.403127
lat                                 478.901375
borough_Xochimilco                  929.857400
borough_Miguel Hidalgo             1977.314718
borough_Azcapotzalco               2459.288646
lon                               -2492.221814
borough_Álvaro Obregón             3275.121061
borough_Coyoacán                   3737.561001
borough_Venustiano Carranza       -5609.918629
borough_La Magdalena Contreras    -5925.666450
borough_Gustavo A. Madero         -6637.429757
borough_Cuajimalpa de Morelos      9157.269123
borough_Tlalpan                   10319.429804
borough_Iztapalapa               -13349.017448
borough_Benito Juárez             13778.188880
borough_Tláhuac                  -14166.869486
dtype: float64


feat_imp.sort_values(key=abs)

surface_covered_in_m2               291.654156
borough_Cuauhtémoc                 -350.531990
borough_Iztacalco                   405.403127
lat                                 478.901375
borough_Xochimilco                  929.857400
borough_Miguel Hidalgo             1977.314718
borough_Azcapotzalco               2459.288646
lon                               -2492.221814
borough_Álvaro Obregón             3275.121061
borough_Coyoacán                   3737.561001
borough_Venustiano Carranza       -5609.918629
borough_La Magdalena Contreras    -5925.666450
borough_Gustavo A. Madero         -6637.429757
borough_Cuajimalpa de Morelos      9157.269123
borough_Tlalpan                   10319.429804
borough_Iztapalapa               -13349.017448
borough_Benito Juárez             13778.188880
borough_Tláhuac                  -14166.869486
dtype: float64


# Create a horizontal bar chart that shows the 10 most influential coefficients for my model
feat_imp.sort_values(key=abs).tail(15).plot(kind="barh")
plt.xlabel("Importance [USD]") # Label x-axis as "Importance [USD]"
plt.ylabel("Feature") # Label y-axis as "Feature"
plt.title("Feature Importances for Apartment Price"); # Title chart as "Feature Importances for Apartment Price"

	surface_covered_in_m2	lat	lon	borough
0	75.0	19.362690	-99.150565	Benito Juárez
1	62.0	19.291345	-99.124312	Tlalpan
2	85.0	19.354987	-99.061709	Iztapalapa
3	52.0	19.469681	-99.086136	Gustavo A. Madero
4	53.0	19.443592	-99.121407	Venustiano Carranza

	surface_covered_in_m2	lat	lon	borough
0	60.0	19.493185	-99.205755	Azcapotzalco
1	55.0	19.307247	-99.166700	Coyoacán
2	50.0	19.363469	-99.010141	Iztapalapa
3	60.0	19.474655	-99.189277	Azcapotzalco
4	74.0	19.394628	-99.143842	Benito Juárez

Prepare Data¶

Import¶

Explore¶

Split¶

Build Model¶

Iterate¶

Evaluate¶

Communicate Results¶

	price_aprox_usd	surface_covered_in_m2	lat	lon	borough
0	82737.39	75.0	19.362690	-99.150565	Benito Juárez
1	72197.60	62.0	19.291345	-99.124312	Tlalpan
2	44277.72	85.0	19.354987	-99.061709	Iztapalapa
3	60589.45	52.0	19.469681	-99.086136	Gustavo A. Madero
4	47429.08	53.0	19.443592	-99.121407	Venustiano Carranza