Housing Price Predictor for Buenos Aires 🇦🇷
Part 2: Predicting Housing Price with Location
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
This project builds on my Predicting Housing Price with Size project. Here, I create a more complex wrangle function, use it to clean more data, and build a model that considers more features when predicting apartment price.
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=FutureWarning)
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    return df
Import CSV 1
# Use wrangle function to create a DataFrame frame1 from the CSV file "data/buenos-aires-real-estate-1.csv"
frame1 = wrangle("data/buenos-aires-real-estate-1.csv")
print(frame1.info())
frame1.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   1343 non-null   object
 1   property_type               1343 non-null   object
 2   place_with_parent_names     1343 non-null   object
 3   lat-lon                     1300 non-null   object
 4   price                       1343 non-null   float64
 5   currency                    1343 non-null   object
 6   price_aprox_local_currency  1343 non-null   float64
 7   price_aprox_usd             1343 non-null   float64
 8   surface_total_in_m2         965 non-null    float64
 9   surface_covered_in_m2       1343 non-null   float64
 10  price_usd_per_m2            927 non-null    float64
 11  price_per_m2                1343 non-null   float64
 12  floor                       379 non-null    float64
 13  rooms                       1078 non-null   float64
 14  expenses                    349 non-null    object
 15  properati_url               1343 non-null   object
dtypes: float64(9), object(7)
memory usage: 178.4+ KB
None
 | operation | property_type | place_with_parent_names | lat-lon | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | -34.5846508988,-58.4546932614 | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | -34.6389789,-58.500115 | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | -34.615847,-58.459957 | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | -34.6252219,-58.3823825 | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... |
41 | sell | apartment | |Argentina|Capital Federal|Once| | -34.6106102,-58.4125107 | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... |
For my model, I'm going to consider apartment location, specifically latitude and longitude. Looking at the output from frame1.info(), I can see that the location information is in a single column whose data type is object (the pandas term for str in this case). In order to build my model, I need latitude and longitude to each be in their own column with the data type float.
Split "Lat-Lon" Column
Here, I'll edit my wrangle function so that, in the DataFrame it returns, the "lat-lon" column is replaced by separate "lat" and "lon" columns.
frame1["lat-lon"].head()
4     -34.5846508988,-58.4546932614
9            -34.6389789,-58.500115
29            -34.615847,-58.459957
40          -34.6252219,-58.3823825
41          -34.6106102,-58.4125107
Name: lat-lon, dtype: object
frame1["lat-lon"].str.split(",").head()
4     [-34.5846508988, -58.4546932614]
9            [-34.6389789, -58.500115]
29            [-34.615847, -58.459957]
40          [-34.6252219, -58.3823825]
41          [-34.6106102, -58.4125107]
Name: lat-lon, dtype: object
frame1["lat-lon"].str.split(",", expand=True).head()
 | 0 | 1 |
---|---|---|
4 | -34.5846508988 | -58.4546932614 |
9 | -34.6389789 | -58.500115 |
29 | -34.615847 | -58.459957 |
40 | -34.6252219 | -58.3823825 |
41 | -34.6106102 | -58.4125107 |
# Recast my data
frame1["lat-lon"].str.split(",", expand=True).astype(float)
 | 0 | 1 |
---|---|---|
4 | -34.584651 | -58.454693 |
9 | -34.638979 | -58.500115 |
29 | -34.615847 | -58.459957 |
40 | -34.625222 | -58.382382 |
41 | -34.610610 | -58.412511 |
... | ... | ... |
8589 | -34.631591 | -58.370191 |
8590 | -34.604555 | -58.418206 |
8593 | -34.624002 | -58.390588 |
8601 | -34.601455 | -58.378132 |
8604 | NaN | NaN |
1343 rows × 2 columns
frame1["lat-lon"].str.split(",", expand=True).astype(float).info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       1300 non-null   float64
 1   1       1300 non-null   float64
dtypes: float64(2)
memory usage: 31.5 KB
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon" into separate float columns "lat" and "lon"
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)  # Drop the original column in place rather than creating a new DataFrame

    return df
Summary
- Added the "lat-lon" split to my wrangle function
- Modified df inside the scope of my function (using inplace=True)
- Dropped the original "lat-lon" column
# Check that "lat-lon" is dropped and the new "lat" and "lon" columns are created
frame1 = wrangle("data/buenos-aires-real-estate-1.csv")
print(frame1.info())
frame1.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1343 entries, 4 to 8604
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   1343 non-null   object
 1   property_type               1343 non-null   object
 2   place_with_parent_names     1343 non-null   object
 3   price                       1343 non-null   float64
 4   currency                    1343 non-null   object
 5   price_aprox_local_currency  1343 non-null   float64
 6   price_aprox_usd             1343 non-null   float64
 7   surface_total_in_m2         965 non-null    float64
 8   surface_covered_in_m2       1343 non-null   float64
 9   price_usd_per_m2            927 non-null    float64
 10  price_per_m2                1343 non-null   float64
 11  floor                       379 non-null    float64
 12  rooms                       1078 non-null   float64
 13  expenses                    349 non-null    object
 14  properati_url               1343 non-null   object
 15  lat                         1300 non-null   float64
 16  lon                         1300 non-null   float64
dtypes: float64(11), object(6)
memory usage: 188.9+ KB
None
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
41 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
Now that my wrangle function is working, I can use it to clean more data.
Import CSV 2
# Use the revised wrangle function to create a DataFrame frame2 from the file "data/buenos-aires-real-estate-2.csv"
frame2 = wrangle("data/buenos-aires-real-estate-2.csv")
Using a function is much quicker than cleaning each file individually. I can now combine my DataFrames so I can use them to train my model.
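As an aside, if more listing CSVs followed the same naming pattern, the whole cleaning-and-combining step could be done in a loop. A minimal sketch (the glob pattern below is an assumption about the file names, not files guaranteed to exist in this project):
# Sketch: wrangle every matching CSV and stack the results (file pattern is assumed)
from glob import glob
files = sorted(glob("data/buenos-aires-real-estate-*.csv"))
frames = [wrangle(file) for file in files]
df_all = pd.concat(frames, ignore_index=True)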
frame1.head()
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
9 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
29 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
40 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
41 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
frame2.head()
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | sell | apartment | |Argentina|Capital Federal|Recoleta| | 215000.0 | USD | 3259916.00 | 215000.00 | 40.0 | 35.0 | 5375.000000 | 6142.857143 | NaN | 1.0 | 3500.0 | http://recoleta.properati.com.ar/12j4v_venta_d... | -34.588993 | -58.400133 |
9 | sell | apartment | |Argentina|Capital Federal|Recoleta| | 341550.0 | USD | 5178717.72 | 341550.00 | NaN | 90.0 | NaN | 3795.000000 | 8.0 | 2.0 | NaN | http://recoleta.properati.com.ar/100t0_venta_d... | -34.588044 | -58.398066 |
12 | sell | apartment | |Argentina|Capital Federal|Monserrat| | 1386000.0 | ARS | 1382153.13 | 91156.62 | 39.0 | 33.0 | 2337.349231 | 42000.000000 | NaN | NaN | NaN | http://monserrat.properati.com.ar/t05l_venta_d... | -34.623320 | -58.397461 |
13 | sell | apartment | |Argentina|Capital Federal|Belgrano| | 105000.0 | USD | 1592052.00 | 105000.00 | NaN | 33.0 | NaN | 3181.818182 | 1.0 | 1.0 | NaN | http://belgrano.properati.com.ar/zsd5_venta_de... | -34.553897 | -58.451939 |
17 | sell | apartment | |Argentina|Capital Federal|Villa del Parque| | 89681.0 | USD | 1359779.19 | 89681.00 | 46.0 | 39.0 | 1949.586957 | 2299.512821 | NaN | 1.0 | 1500.0 | http://villa-del-parque.properati.com.ar/12q2f... | -34.628813 | -58.472230 |
# Use pd.concat to concatenate frame1 and frame2 into a new DataFrame df
df = pd.concat([frame1, frame2], ignore_index=True) # Set the ignore_index argument to True
print(df.info())
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   operation                   2658 non-null   object
 1   property_type               2658 non-null   object
 2   place_with_parent_names     2658 non-null   object
 3   price                       2658 non-null   float64
 4   currency                    2658 non-null   object
 5   price_aprox_local_currency  2658 non-null   float64
 6   price_aprox_usd             2658 non-null   float64
 7   surface_total_in_m2         1898 non-null   float64
 8   surface_covered_in_m2       2658 non-null   float64
 9   price_usd_per_m2            1818 non-null   float64
 10  price_per_m2                2658 non-null   float64
 11  floor                       769 non-null    float64
 12  rooms                       2137 non-null   float64
 13  expenses                    688 non-null    object
 14  properati_url               2658 non-null   object
 15  lat                         2561 non-null   float64
 16  lon                         2561 non-null   float64
dtypes: float64(11), object(6)
memory usage: 353.1+ KB
None
 | operation | property_type | place_with_parent_names | price | currency | price_aprox_local_currency | price_aprox_usd | surface_total_in_m2 | surface_covered_in_m2 | price_usd_per_m2 | price_per_m2 | floor | rooms | expenses | properati_url | lat | lon |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | sell | apartment | |Argentina|Capital Federal|Chacarita| | 129000.0 | USD | 1955949.6 | 129000.0 | 76.0 | 70.0 | 1697.368421 | 1842.857143 | NaN | NaN | NaN | http://chacarita.properati.com.ar/10qlv_venta_... | -34.584651 | -58.454693 |
1 | sell | apartment | |Argentina|Capital Federal|Villa Luro| | 87000.0 | USD | 1319128.8 | 87000.0 | 48.0 | 42.0 | 1812.500000 | 2071.428571 | NaN | NaN | NaN | http://villa-luro.properati.com.ar/12m82_venta... | -34.638979 | -58.500115 |
2 | sell | apartment | |Argentina|Capital Federal|Caballito| | 118000.0 | USD | 1789163.2 | 118000.0 | NaN | 54.0 | NaN | 2185.185185 | NaN | 2.0 | NaN | http://caballito.properati.com.ar/11wqh_venta_... | -34.615847 | -58.459957 |
3 | sell | apartment | |Argentina|Capital Federal|Constitución| | 57000.0 | USD | 864256.8 | 57000.0 | 42.0 | 42.0 | 1357.142857 | 1357.142857 | 5.0 | 2.0 | 364 | http://constitucion.properati.com.ar/k2f0_vent... | -34.625222 | -58.382382 |
4 | sell | apartment | |Argentina|Capital Federal|Once| | 90000.0 | USD | 1364616.0 | 90000.0 | 57.0 | 50.0 | 1578.947368 | 1800.000000 | NaN | 3.0 | 450 | http://once.properati.com.ar/suwa_venta_depart... | -34.610610 | -58.412511 |
In the Predicting Housing Price with Size project, I built a simple linear model that predicted apartment price based on one feature, surface_covered_in_m2. In this project, I'm building a multiple linear regression model that predicts price based on two features, lon and lat. This means that my data visualizations now have to communicate three pieces of information: longitude, latitude, and price.
I can represent these three attributes on a two-dimensional screen. One option is to incorporate color into my scatter plot. For example, in the Mapbox scatter plot below, the location of each point represents latitude and longitude, and color represents price.
Scatter Mapbox Plot
# Create a Mapbox scatter plot that shows the location of the apartments in df
fig = px.scatter_mapbox(
df, # My DataFrame
lat="lat",
lon="lon",
width=600, # Width of map
height=600, # Height of map
color="price_aprox_usd",
hover_data=["price_aprox_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
This visualization shows just how important location is in determining property price: as you move closer to the coastline, property prices tend to go up.
Another option is to add a third dimension to my scatter plot. I can plot longitude on the x-axis and latitude on the y-axis (like I did in the map above), and then add a z-axis with price.
3D Scatter Plot
# Create 3D scatter plot
fig = px.scatter_3d(
df,
x="lon", # Label "lon" on the x-axis
y="lat", # Label "lat" on the y-axis
z="price_aprox_usd", # Label "price_aprox_usd" on the z-axis
labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
width=600,
height=500,
)
# Refine formatting
fig.update_traces(
marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
selector={"mode": "markers"},
)
# Display figure
fig.show()
Even though I'm building a different model, the steps I follow will be the same. I will now separate my features (latitude and longitude) from my target (price).
Split Data: Feature Matrix
# Create the feature matrix named X_train
features = ["lon", "lat"] # It should contain two features: ["lon", "lat"]
X_train = df[features]
X_train.head()
 | lon | lat |
---|---|---|
0 | -58.454693 | -34.584651 |
1 | -58.500115 | -34.638979 |
2 | -58.459957 | -34.615847 |
3 | -58.382382 | -34.625222 |
4 | -58.412511 | -34.610610 |
X_train.shape
(2658, 2)
Split Data: Target Vector
# Create the target vector named y_train, which will be used to train my model
target = "price_aprox_usd"
y_train = df[target]
y_train.head()
0    129000.0
1     87000.0
2    118000.0
3     57000.0
4     90000.0
Name: price_aprox_usd, dtype: float64
# Target vector should be one-dimensional (most cases)
y_train.shape
(2658,)
I need to set a baseline so I can evaluate my model's performance. Since I've added more observations to my training data than in the Predicting Housing Price with Size project, the value of y_mean won't be exactly the same.
Baseline: Mean
# Calculate the mean of my target vector y_train and assign it to the variable y_mean
y_mean = y_train.mean()
# Create a list named y_pred_baseline containing the value of y_mean repeated so it's the same length as y_train
y_pred_baseline = [y_mean] * len(y_train)
y_pred_baseline[:5]
[134732.97340481562, 134732.97340481562, 134732.97340481562, 134732.97340481562, 134732.97340481562]
Baseline: Mean Absolute Error
# Calculate baseline mean absolute error for predictions in y_pred_baseline
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)  # As compared to the true targets in y_train
print("Mean apt price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean apt price 134732.97
Baseline MAE: 45422.75
If I were to predict that every apartment costs about $135,000, my predictions would be off by about $45,000 on average.
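As a quick sanity check, this baseline MAE is just the mean absolute deviation of the target around its mean, so it can be reproduced directly from y_train:
# Sanity check: the baseline MAE equals the mean absolute deviation of y_train around y_mean
mad = (y_train - y_mean).abs().mean()
print("Mean absolute deviation:", round(mad, 2))  # should match mae_baseline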
Looking at the output of df.info() and the values in the "Non-Null Count" column, I can see that the "lat" and "lon" columns contain missing values, and a linear regression model can't handle observations with missing values.
Dropping rows that contain NaN values isn't ideal in this case, because models generally perform better when they have more data to train with. Instead, I can fill in these missing values using information from the whole column via imputation. There are many different strategies for imputing missing values, and one of the most common is filling in the missing values with the mean of the column.
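For reference, SimpleImputer supports several strategies besides the mean; a short illustrative sketch (none of these alternatives are used in this project):
# Illustrative only: other imputation strategies available in SimpleImputer
imputer_mean = SimpleImputer(strategy="mean")                      # column mean (the default)
imputer_median = SimpleImputer(strategy="median")                  # column median, more robust to outliers
imputer_mode = SimpleImputer(strategy="most_frequent")             # most common value
imputer_const = SimpleImputer(strategy="constant", fill_value=0)   # a fixed fill value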
In addition to predictors like LinearRegression, scikit-learn also has transformers that help deal with issues like missing values. I'll add one to my model in the next step.
Instantiate Transformer
# This won't run as-is: LinearRegression can't fit data that contains NaN values
model = LinearRegression()
model.fit(X_train, y_train)
X_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lon     2561 non-null   float64
 1   lat     2561 non-null   float64
dtypes: float64(2)
memory usage: 41.7 KB
# Instantiate a SimpleImputer (transformer) named imputer
imputer = SimpleImputer()
Just like a predictor, a transformer has a fit method. In the case of my SimpleImputer, this is the step where it calculates the mean values for each numerical column.
Train Transformer
# Fit the transformer imputer to the feature matrix X_train
imputer.fit(X_train)
SimpleImputer()
# Check work
check_is_fitted(imputer)
Here's where transformers diverge from predictors. Instead of using a method like predict, I use the transform method. This is the step where the transformer fills in the missing values with the means it's calculated.
Transform Data
# Use imputer to transform the feature matrix X_train and assign the transformed data to the variable XT_train
XT_train = imputer.transform(X_train)
pd.DataFrame(XT_train, columns=X_train.columns).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   lon     2658 non-null   float64
 1   lat     2658 non-null   float64
dtypes: float64(2)
memory usage: 41.7 KB
My data is free of missing values. But since a model may require multiple transformers, and doing all those transformations one-by-one is slow and likely to lead to errors, I can combine my transformer and predictor into a single object called a pipeline.
Build Pipeline
# Create a pipeline named model that contains a SimpleImputer transformer
model = make_pipeline(
    SimpleImputer(),
    LinearRegression()  # Followed by a LinearRegression predictor
)
I can have as many transformers as I want in a pipeline (one after another) but I can only have one predictor and it has to come at the end of the pipeline.
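For instance, a pipeline could chain several transformers ahead of the single, final predictor. A minimal sketch (the StandardScaler step isn't needed for this model; it's only there to illustrate the ordering):
# Illustrative only: two transformers followed by one predictor at the end
from sklearn.preprocessing import StandardScaler
example_pipeline = make_pipeline(
    SimpleImputer(),     # transformer 1: fill missing values
    StandardScaler(),    # transformer 2: standardize features
    LinearRegression()   # predictor: must come last
)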
With my pipeline assembled, I'll use the fit method, which will train the transformer, transform the data, then pass the transformed data to the predictor for training, all in one step. This is a much easier approach.
Train Model
# Fit my model to the data, X_train and y_train
model.fit(X_train, y_train)
Pipeline(steps=[('simpleimputer', SimpleImputer()), ('linearregression', LinearRegression())])
# Check my work
check_is_fitted(model["linearregression"])
Now I can see how my trained model performs.
I'll start by evaluating my model's performance on the training data.
Generate Predictions
# Create a list of predictions for the observations in my feature matrix X_train, named y_pred_training
y_pred_training = model.predict(X_train)
Training Mean Absolute Error
# Calculate the training mean absolute error for predictions in y_pred_training as compared to the true targets in y_train
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))
Training MAE: 42962.72
It looks like my model performs a little better than the baseline. This suggests that latitude and longitude aren't as strong predictors of price as size is.
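To put "a little better" in numbers, the training MAE is roughly a 5% reduction relative to the baseline; a quick comparison using the variables above:
# Quick comparison of baseline vs. training error
improvement = mae_baseline - mae_training
print("MAE improvement over baseline:", round(improvement, 2))
print("Relative improvement:", round(100 * improvement / mae_baseline, 1), "%")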
Now I can check my test performance. Keep in mind that once I evaluate my model on the test set, I shouldn't go back and iterate on it any further.
# Import test data "data/buenos-aires-test-features.csv" into a DataFrame
X_test = pd.read_csv("data/buenos-aires-test-features.csv")[features]
y_pred_test = pd.Series(model.predict(X_test)) # Generate a Series of predictions using my model
y_pred_test.head()
0    136372.324695
1    168620.352353
2    130231.628267
3    102497.549527
4    123482.077850
dtype: float64
I want my test performance to be about the same as my training performance.
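The test-features file here doesn't include the true prices, so I can only generate predictions. If the true test prices were available, say in a hypothetical "data/buenos-aires-test-labels.csv" with a "price_aprox_usd" column (an assumed file, not part of this project), the check would look like this:
# Hypothetical check, assuming a labels file exists with the true test prices
y_test = pd.read_csv("data/buenos-aires-test-labels.csv")["price_aprox_usd"]  # assumed file and column
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print("Test MAE:", round(mae_test, 2))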
I now look at the equation my model has come up with for predicting price based on latitude and longitude. I'll need to expand on my formula to account for both features.
Extract Intercept and Coefficients
# Extract the intercept for my model
intercept = model.named_steps["linearregression"].intercept_.round() # Round to whole number
intercept
38113587.0
# Extract the coefficients for my model
coefficients = model.named_steps["linearregression"].coef_.round()
coefficients
array([196709., 765467.])
print(f"apt_price = {intercept} + {coefficients} * surface_covered")
apt_price = 38113587.0 + [196709. 765467.] * surface_covered
The equation above tells me that, since both coefficients are positive, the predicted apartment price increases as you move north and east, which is consistent with the map: prices rise toward the coastline.
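As a check on the equation, plugging one apartment's coordinates into it should roughly reproduce the pipeline's prediction for that row (the small difference comes from rounding the intercept and coefficients):
# Compare the written-out equation against the pipeline for the first training observation
lon0, lat0 = X_train.iloc[0][["lon", "lat"]]
price_from_equation = intercept + coefficients[0] * lon0 + coefficients[1] * lat0
price_from_pipeline = model.predict(X_train.iloc[[0]])[0]
print("Equation prediction:", round(price_from_equation, 2))
print("Pipeline prediction:", round(price_from_pipeline, 2))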
Model 3D Scatter Plot
# Create a 3D scatter plot
fig = px.scatter_3d(
df,
x="lon", # "lon" on the x-axis
y="lat", # "lat" on the y-axis
z="price_aprox_usd", # "price_aprox_usd" on the z-axis
labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
width=600,
height=500,
)
# Create x and y coordinates for model representation
x_plane = np.linspace(df["lon"].min(), df["lon"].max(), 10)
y_plane = np.linspace(df["lat"].min(), df["lat"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)
# Use model to predict z coordinates over the full grid so the surface shows the fitted plane
zz = model.predict(
    pd.DataFrame({"lon": xx.ravel(), "lat": yy.ravel()})
).reshape(xx.shape)
# Add plane to figure
fig.add_trace(go.Surface(x=xx, y=yy, z=zz))
# Refine formatting
fig.update_traces(
marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
selector={"mode": "markers"},
)
# Display figure
fig.show()