__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


# Import the CSV file "data/brasil-real-estate-1.csv" into the DataFrame (df1)
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.head()


# Inspect df1 using the info and head methods
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12834 entries, 0 to 12833
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   property_type            12834 non-null  object 
 1   place_with_parent_names  12834 non-null  object 
 2   region                   12834 non-null  object 
 3   lat-lon                  11551 non-null  object 
 4   area_m2                  12834 non-null  float64
 5   price_usd                12834 non-null  object 
dtypes: float64(1), object(5)
memory usage: 601.7+ KB


# Drop all the rows with NaN values from the DataFrame (df1)
df1.dropna(inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11551 entries, 0 to 12833
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   property_type            11551 non-null  object 
 1   place_with_parent_names  11551 non-null  object 
 2   region                   11551 non-null  object 
 3   lat-lon                  11551 non-null  object 
 4   area_m2                  11551 non-null  float64
 5   price_usd                11551 non-null  object 
dtypes: float64(1), object(5)
memory usage: 631.7+ KB


# Use the "lat-lon" column to create two separate columns in df1: "lat" and "lon"
df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True)
df1.head()


df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11551 entries, 0 to 12833
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   property_type            11551 non-null  object 
 1   place_with_parent_names  11551 non-null  object 
 2   region                   11551 non-null  object 
 3   lat-lon                  11551 non-null  object 
 4   area_m2                  11551 non-null  float64
 5   price_usd                11551 non-null  object 
 6   lat                      11551 non-null  object 
 7   lon                      11551 non-null  object 
dtypes: float64(1), object(7)
memory usage: 812.2+ KB


df1["lat"] = (
    df1["lat"]
    .astype(float) # Make sure that the data type for these new columns is float
)


df1["lon"] = (
    df1["lon"]
    .astype(float)
)


df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11551 entries, 0 to 12833
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   property_type            11551 non-null  object 
 1   place_with_parent_names  11551 non-null  object 
 2   region                   11551 non-null  object 
 3   lat-lon                  11551 non-null  object 
 4   area_m2                  11551 non-null  float64
 5   price_usd                11551 non-null  object 
 6   lat                      11551 non-null  float64
 7   lon                      11551 non-null  float64
dtypes: float64(3), object(5)
memory usage: 812.2+ KB


df1.head()


# Use the "place_with_parent_names" column to create a "state" column for df1
df1["state"] = df1["place_with_parent_names"].str.split("|", expand=True)[2]
df1.head()


# Transform the "price_usd" column of df1 so that all values are floating-point numbers instead of strings
df1["price_usd"] = (
    df1["price_usd"]
    .str.replace("$","", regex=False)
    .str.replace(",","")
    .astype(float)
)


df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11551 entries, 0 to 12833
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   property_type            11551 non-null  object 
 1   place_with_parent_names  11551 non-null  object 
 2   region                   11551 non-null  object 
 3   lat-lon                  11551 non-null  object 
 4   area_m2                  11551 non-null  float64
 5   price_usd                11551 non-null  float64
 6   lat                      11551 non-null  float64
 7   lon                      11551 non-null  float64
 8   state                    11551 non-null  object 
dtypes: float64(4), object(5)
memory usage: 902.4+ KB


# Drop the "lat-lon" and "place_with_parent_names" columns from df1
df1.drop(columns=["place_with_parent_names", "lat-lon"], inplace=True)
df1.head()


# Import the CSV file "data/brasil-real-estate-2.csv" into the DataFrame (df2)
df2 = pd.read_csv("data/brasil-real-estate-2.csv")


# Assess df2 using the info and head methods
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12833 entries, 0 to 12832
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   property_type  12833 non-null  object 
 1   state          12833 non-null  object 
 2   region         12833 non-null  object 
 3   lat            12833 non-null  float64
 4   lon            12833 non-null  float64
 5   area_m2        11293 non-null  float64
 6   price_brl      12833 non-null  float64
dtypes: float64(4), object(3)
memory usage: 701.9+ KB


# Use the "price_brl" column to create a new column named "price_usd"
df2["price_usd"] = (df2["price_brl"] / 3.19).round(2) # When this data was collected in 2015 and 2016, a US dollar cost 3.19 Brazilian reals


df2.head()


# Drop the "price_brl" column from df2, as well as any rows that have NaN values
df2.drop(columns=["price_brl"], inplace=True)
df2.dropna(inplace=True)


df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11293 entries, 0 to 12832
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   property_type  11293 non-null  object 
 1   state          11293 non-null  object 
 2   region         11293 non-null  object 
 3   lat            11293 non-null  float64
 4   lon            11293 non-null  float64
 5   area_m2        11293 non-null  float64
 6   price_usd      11293 non-null  float64
dtypes: float64(4), object(3)
memory usage: 705.8+ KB


# Concatenate df1 and df2 to create a new DataFrame named df
df = pd.concat([df1, df2])
print("df shape:", df.shape)

df shape: (22844, 7)


# Create a scatter_mapbox showing the location of the properties in df
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    center={"lat": -14.2, "lon": -51.9},  # Map will be centered on Brazil
    width=600,
    height=600,
    hover_data=["price_usd"],  # Display price when hovering mouse over house
)

fig.update_layout(mapbox_style="open-street-map")

fig.show()


# Create a DataFrame (summary_stats) with the summary statistics for the "area_m2" and "price_usd" columns
summary_stats = df[["area_m2", "price_usd"]].describe()
summary_stats


# Create a histogram of "price_usd"
plt.hist(df["price_usd"])
plt.xlabel("Price [USD]") # x-axis has the label "Price [USD]"
plt.ylabel("Frequency"); # y-axis has the label "Frequency"
plt.title("Distribution of Home Sizes"); # plot has the title "Distribution of Home Prices"


# Create a horizontal boxplot of "area_m2"
plt.boxplot(df["area_m2"], vert=False)
plt.xlabel("Area [sq meters]") # x-axis has the label "Area [sq meters]"
plt.title("Distribution of Home Sizes"); # plot has the title "Distribution of Home Sizes"


# Create a Series (via Groupby) named "mean_price_by_region" that shows the mean home price in each region in Brazil
mean_price_by_region = df.groupby("region")["price_usd"].mean().sort_values(ascending=True) # Sort from smallest to largest
mean_price_by_region

region
Central-West    178596.283663
North           181308.958188
Northeast       185422.985482
South           189012.345360
Southeast       208996.762761
Name: price_usd, dtype: float64


# Use "mean_price_by_region" to create a bar chart
mean_price_by_region.plot(
    kind="bar",
    xlabel="Region", # label the x-axis as "Region"
    ylabel="Mean Price [USD]", # y-axis as "Mean Price [USD]"
    title="Mean Home Price by Region" # Title "Mean Home Price by Region"
);


# Create a DataFrame (df_south) that contains all the homes from df that are in the "South" region
df_south = df[df["region"] == "South"]
df_south.head()


# Create a Series (via value_counts) "homes_by_state" that contains the number of properties in each state in df_south
homes_by_state = df_south["state"].value_counts().head(10)
homes_by_state

Rio Grande do Sul    2643
Santa Catarina       2634
Paraná               2544
Name: state, dtype: int64


# Create a scatter plot showing price vs. area for the state in df_south that has the largest number of properties
Rio_Grande = df[df["state"] == "Rio Grande do Sul"]
plt.scatter(x=Rio_Grande["area_m2"],y=Rio_Grande["price_usd"]);
plt.xlabel("Area [sq meters]") # label the x-axis "Area [sq meters]"
plt.ylabel("Price [USD]") # y-axis "Price [USD]"
plt.title("Rio Grande do Sul: Price vs. Area"); # title "Rio Grande do Sul: Price vs. Area"


# Create a dictionary "south_states_corr", where the keys are the names of the three states in the "South" region and their associated values are the correlation coefficient between "area_m2" and "price_usd" in that state

south_states_corr = {}

df_south["area_m2"].corr(df_south["price_usd"])

south_states_corr

{}


south_states_corr = {}
south_states = df[df["region"] == 'South']["state"].unique()
for state in south_states:
    df_state = df[df["state"] == state]
    p_corr = df_state["area_m2"].corr(df_state["price_usd"])
    south_states_corr[state] = p_corr
    
south_states_corr

{'Paraná': 0.5436659935502657,
 'Rio Grande do Sul': 0.5773267433871903,
 'Santa Catarina': 0.5068121769989855}

	property_type	place_with_parent_names	region	lat-lon	area_m2	price_usd
0	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6443051,-35.7088142	110.0	$187,230.85
1	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6430934,-35.70484	65.0	$81,133.37
2	house	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6227033,-35.7297953	211.0	$154,465.45
3	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.622837,-35.719556	99.0	$146,013.20
4	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.654955,-35.700227	55.0	$101,416.71

	property_type	place_with_parent_names	region	lat-lon	area_m2	price_usd
0	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6443051,-35.7088142	110.0	$187,230.85
1	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6430934,-35.70484	65.0	$81,133.37
2	house	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6227033,-35.7297953	211.0	$154,465.45
3	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.622837,-35.719556	99.0	$146,013.20
4	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.654955,-35.700227	55.0	$101,416.71

	property_type	place_with_parent_names	region	lat-lon	area_m2	price_usd	lat	lon
0	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6443051,-35.7088142	110.0	$187,230.85	-9.6443051	-35.7088142
1	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6430934,-35.70484	65.0	$81,133.37	-9.6430934	-35.70484
2	house	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6227033,-35.7297953	211.0	$154,465.45	-9.6227033	-35.7297953
3	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.622837,-35.719556	99.0	$146,013.20	-9.622837	-35.719556
4	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.654955,-35.700227	55.0	$101,416.71	-9.654955	-35.700227

	property_type	place_with_parent_names	region	lat-lon	area_m2	price_usd	lat	lon
0	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6443051,-35.7088142	110.0	$187,230.85	-9.644305	-35.708814
1	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6430934,-35.70484	65.0	$81,133.37	-9.643093	-35.704840
2	house	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6227033,-35.7297953	211.0	$154,465.45	-9.622703	-35.729795
3	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.622837,-35.719556	99.0	$146,013.20	-9.622837	-35.719556
4	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.654955,-35.700227	55.0	$101,416.71	-9.654955	-35.700227

	property_type	place_with_parent_names	region	lat-lon	area_m2	price_usd	lat	lon	state
0	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6443051,-35.7088142	110.0	$187,230.85	-9.644305	-35.708814	Alagoas
1	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6430934,-35.70484	65.0	$81,133.37	-9.643093	-35.704840	Alagoas
2	house	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.6227033,-35.7297953	211.0	$154,465.45	-9.622703	-35.729795	Alagoas
3	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.622837,-35.719556	99.0	$146,013.20	-9.622837	-35.719556	Alagoas
4	apartment	\|Brasil\|Alagoas\|Maceió\|	Northeast	-9.654955,-35.700227	55.0	$101,416.71	-9.654955	-35.700227	Alagoas

Data Wrangling¶

Exploratory Data Analysis (EDA)¶

	property_type	state	region	lat	lon	area_m2	price_brl
0	apartment	Pernambuco	Northeast	-8.134204	-34.906326	72.0	414222.98
1	apartment	Pernambuco	Northeast	-8.126664	-34.903924	136.0	848408.53
2	apartment	Pernambuco	Northeast	-8.125550	-34.907601	75.0	299438.28
3	apartment	Pernambuco	Northeast	-8.120249	-34.895920	187.0	848408.53
4	apartment	Pernambuco	Northeast	-8.142666	-34.906906	80.0	464129.36

	area_m2	price_usd
count	22844.000000	22844.000000
mean	115.020224	194987.315515
std	47.742932	103617.682979
min	53.000000	74892.340000
25%	76.000000	113898.770000
50%	103.000000	165697.555000
75%	142.000000	246900.882500
max	252.000000	525659.720000

	property_type	region	area_m2	price_usd	lat	lon	state
9304	apartment	South	127.0	296448.85	-25.455704	-49.292918	Paraná
9305	apartment	South	104.0	219996.25	-25.455704	-49.292918	Paraná
9306	apartment	South	100.0	194210.50	-25.460236	-49.293812	Paraná
9307	apartment	South	77.0	149252.94	-25.460236	-49.293812	Paraná
9308	apartment	South	73.0	144167.75	-25.460236	-49.293812	Paraná