Exploring Property Prices in Brazil 🇧🇷
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In this project, I'll work with a dataset of homes for sale in Brazil. My goal is to determine if there are regional differences in the real estate market. Also, I will look at Southern Brazil to see if there is a relationship between home size and price.
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
# Import the CSV file "data/brasil-real-estate-1.csv" into the DataFrame (df1)
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.head()
property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
---|---|---|---|---|---|---|
0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 |
1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 |
2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 |
3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 |
4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 |
# Inspect df1 using the info and head methods
df1.info()
df1.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12834 entries, 0 to 12833 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 12834 non-null object 1 place_with_parent_names 12834 non-null object 2 region 12834 non-null object 3 lat-lon 11551 non-null object 4 area_m2 12834 non-null float64 5 price_usd 12834 non-null object dtypes: float64(1), object(5) memory usage: 601.7+ KB
property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | |
---|---|---|---|---|---|---|
0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 |
1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 |
2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 |
3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 |
4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 |
# Drop all the rows with NaN values from the DataFrame (df1)
df1.dropna(inplace=True)
df1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11551 entries, 0 to 12833 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 11551 non-null object 1 place_with_parent_names 11551 non-null object 2 region 11551 non-null object 3 lat-lon 11551 non-null object 4 area_m2 11551 non-null float64 5 price_usd 11551 non-null object dtypes: float64(1), object(5) memory usage: 631.7+ KB
# Use the "lat-lon" column to create two separate columns in df1: "lat" and "lon"
df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True)
df1.head()
property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | |
---|---|---|---|---|---|---|---|---|
0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 | -9.6443051 | -35.7088142 |
1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 | -9.6430934 | -35.70484 |
2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 | -9.6227033 | -35.7297953 |
3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 | -9.622837 | -35.719556 |
4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 | -9.654955 | -35.700227 |
df1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11551 entries, 0 to 12833 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 11551 non-null object 1 place_with_parent_names 11551 non-null object 2 region 11551 non-null object 3 lat-lon 11551 non-null object 4 area_m2 11551 non-null float64 5 price_usd 11551 non-null object 6 lat 11551 non-null object 7 lon 11551 non-null object dtypes: float64(1), object(7) memory usage: 812.2+ KB
df1["lat"] = (
df1["lat"]
.astype(float) # Make sure that the data type for these new columns is float
)
df1["lon"] = (
df1["lon"]
.astype(float)
)
df1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11551 entries, 0 to 12833 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 11551 non-null object 1 place_with_parent_names 11551 non-null object 2 region 11551 non-null object 3 lat-lon 11551 non-null object 4 area_m2 11551 non-null float64 5 price_usd 11551 non-null object 6 lat 11551 non-null float64 7 lon 11551 non-null float64 dtypes: float64(3), object(5) memory usage: 812.2+ KB
df1.head()
property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | |
---|---|---|---|---|---|---|---|---|
0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 | -9.644305 | -35.708814 |
1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 | -9.643093 | -35.704840 |
2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 | -9.622703 | -35.729795 |
3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 | -9.622837 | -35.719556 |
4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 | -9.654955 | -35.700227 |
# Use the "place_with_parent_names" column to create a "state" column for df1
df1["state"] = df1["place_with_parent_names"].str.split("|", expand=True)[2]
df1.head()
property_type | place_with_parent_names | region | lat-lon | area_m2 | price_usd | lat | lon | state | |
---|---|---|---|---|---|---|---|---|---|
0 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6443051,-35.7088142 | 110.0 | $187,230.85 | -9.644305 | -35.708814 | Alagoas |
1 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.6430934,-35.70484 | 65.0 | $81,133.37 | -9.643093 | -35.704840 | Alagoas |
2 | house | |Brasil|Alagoas|Maceió| | Northeast | -9.6227033,-35.7297953 | 211.0 | $154,465.45 | -9.622703 | -35.729795 | Alagoas |
3 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.622837,-35.719556 | 99.0 | $146,013.20 | -9.622837 | -35.719556 | Alagoas |
4 | apartment | |Brasil|Alagoas|Maceió| | Northeast | -9.654955,-35.700227 | 55.0 | $101,416.71 | -9.654955 | -35.700227 | Alagoas |
# Transform the "price_usd" column of df1 so that all values are floating-point numbers instead of strings
df1["price_usd"] = (
df1["price_usd"]
.str.replace("$","", regex=False)
.str.replace(",","")
.astype(float)
)
df1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11551 entries, 0 to 12833 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 11551 non-null object 1 place_with_parent_names 11551 non-null object 2 region 11551 non-null object 3 lat-lon 11551 non-null object 4 area_m2 11551 non-null float64 5 price_usd 11551 non-null float64 6 lat 11551 non-null float64 7 lon 11551 non-null float64 8 state 11551 non-null object dtypes: float64(4), object(5) memory usage: 902.4+ KB
# Drop the "lat-lon" and "place_with_parent_names" columns from df1
df1.drop(columns=["place_with_parent_names", "lat-lon"], inplace=True)
df1.head()
property_type | region | area_m2 | price_usd | lat | lon | state | |
---|---|---|---|---|---|---|---|
0 | apartment | Northeast | 110.0 | 187230.85 | -9.644305 | -35.708814 | Alagoas |
1 | apartment | Northeast | 65.0 | 81133.37 | -9.643093 | -35.704840 | Alagoas |
2 | house | Northeast | 211.0 | 154465.45 | -9.622703 | -35.729795 | Alagoas |
3 | apartment | Northeast | 99.0 | 146013.20 | -9.622837 | -35.719556 | Alagoas |
4 | apartment | Northeast | 55.0 | 101416.71 | -9.654955 | -35.700227 | Alagoas |
# Import the CSV file "data/brasil-real-estate-2.csv" into the DataFrame (df2)
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
# Assess df2 using the info and head methods
df2.info()
df2.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12833 entries, 0 to 12832 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 12833 non-null object 1 state 12833 non-null object 2 region 12833 non-null object 3 lat 12833 non-null float64 4 lon 12833 non-null float64 5 area_m2 11293 non-null float64 6 price_brl 12833 non-null float64 dtypes: float64(4), object(3) memory usage: 701.9+ KB
property_type | state | region | lat | lon | area_m2 | price_brl | |
---|---|---|---|---|---|---|---|
0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 |
1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 |
2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 |
3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 |
4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 |
# Use the "price_brl" column to create a new column named "price_usd"
df2["price_usd"] = (df2["price_brl"] / 3.19).round(2) # When this data was collected in 2015 and 2016, a US dollar cost 3.19 Brazilian reals
df2.head()
property_type | state | region | lat | lon | area_m2 | price_brl | price_usd | |
---|---|---|---|---|---|---|---|---|
0 | apartment | Pernambuco | Northeast | -8.134204 | -34.906326 | 72.0 | 414222.98 | 129850.46 |
1 | apartment | Pernambuco | Northeast | -8.126664 | -34.903924 | 136.0 | 848408.53 | 265958.79 |
2 | apartment | Pernambuco | Northeast | -8.125550 | -34.907601 | 75.0 | 299438.28 | 93867.80 |
3 | apartment | Pernambuco | Northeast | -8.120249 | -34.895920 | 187.0 | 848408.53 | 265958.79 |
4 | apartment | Pernambuco | Northeast | -8.142666 | -34.906906 | 80.0 | 464129.36 | 145495.10 |
# Drop the "price_brl" column from df2, as well as any rows that have NaN values
df2.drop(columns=["price_brl"], inplace=True)
df2.dropna(inplace=True)
df2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 11293 entries, 0 to 12832 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_type 11293 non-null object 1 state 11293 non-null object 2 region 11293 non-null object 3 lat 11293 non-null float64 4 lon 11293 non-null float64 5 area_m2 11293 non-null float64 6 price_usd 11293 non-null float64 dtypes: float64(4), object(3) memory usage: 705.8+ KB
# Concatenate df1 and df2 to create a new DataFrame named df
df = pd.concat([df1, df2])
print("df shape:", df.shape)
df shape: (22844, 7)
In this section, I'll use data visualization skills to learn more about the regional differences in the Brazilian real estate market.
# Create a scatter_mapbox showing the location of the properties in df
fig = px.scatter_mapbox(
df,
lat="lat",
lon="lon",
center={"lat": -14.2, "lon": -51.9}, # Map will be centered on Brazil
width=600,
height=600,
hover_data=["price_usd"], # Display price when hovering mouse over house
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()
# Create a DataFrame (summary_stats) with the summary statistics for the "area_m2" and "price_usd" columns
summary_stats = df[["area_m2", "price_usd"]].describe()
summary_stats
area_m2 | price_usd | |
---|---|---|
count | 22844.000000 | 22844.000000 |
mean | 115.020224 | 194987.315515 |
std | 47.742932 | 103617.682979 |
min | 53.000000 | 74892.340000 |
25% | 76.000000 | 113898.770000 |
50% | 103.000000 | 165697.555000 |
75% | 142.000000 | 246900.882500 |
max | 252.000000 | 525659.720000 |
# Create a histogram of "price_usd"
plt.hist(df["price_usd"])
plt.xlabel("Price [USD]") # x-axis has the label "Price [USD]"
plt.ylabel("Frequency"); # y-axis has the label "Frequency"
plt.title("Distribution of Home Sizes"); # plot has the title "Distribution of Home Prices"
# Create a horizontal boxplot of "area_m2"
plt.boxplot(df["area_m2"], vert=False)
plt.xlabel("Area [sq meters]") # x-axis has the label "Area [sq meters]"
plt.title("Distribution of Home Sizes"); # plot has the title "Distribution of Home Sizes"
# Create a Series (via Groupby) named "mean_price_by_region" that shows the mean home price in each region in Brazil
mean_price_by_region = df.groupby("region")["price_usd"].mean().sort_values(ascending=True) # Sort from smallest to largest
mean_price_by_region
region Central-West 178596.283663 North 181308.958188 Northeast 185422.985482 South 189012.345360 Southeast 208996.762761 Name: price_usd, dtype: float64
# Use "mean_price_by_region" to create a bar chart
mean_price_by_region.plot(
kind="bar",
xlabel="Region", # label the x-axis as "Region"
ylabel="Mean Price [USD]", # y-axis as "Mean Price [USD]"
title="Mean Home Price by Region" # Title "Mean Home Price by Region"
);
Now I can shift my focus to the Southern region of Brazil, and look at the relationship between home size and price.
# Create a DataFrame (df_south) that contains all the homes from df that are in the "South" region
df_south = df[df["region"] == "South"]
df_south.head()
property_type | region | area_m2 | price_usd | lat | lon | state | |
---|---|---|---|---|---|---|---|
9304 | apartment | South | 127.0 | 296448.85 | -25.455704 | -49.292918 | Paraná |
9305 | apartment | South | 104.0 | 219996.25 | -25.455704 | -49.292918 | Paraná |
9306 | apartment | South | 100.0 | 194210.50 | -25.460236 | -49.293812 | Paraná |
9307 | apartment | South | 77.0 | 149252.94 | -25.460236 | -49.293812 | Paraná |
9308 | apartment | South | 73.0 | 144167.75 | -25.460236 | -49.293812 | Paraná |
# Create a Series (via value_counts) "homes_by_state" that contains the number of properties in each state in df_south
homes_by_state = df_south["state"].value_counts().head(10)
homes_by_state
Rio Grande do Sul 2643 Santa Catarina 2634 Paraná 2544 Name: state, dtype: int64
# Create a scatter plot showing price vs. area for the state in df_south that has the largest number of properties
Rio_Grande = df[df["state"] == "Rio Grande do Sul"]
plt.scatter(x=Rio_Grande["area_m2"],y=Rio_Grande["price_usd"]);
plt.xlabel("Area [sq meters]") # label the x-axis "Area [sq meters]"
plt.ylabel("Price [USD]") # y-axis "Price [USD]"
plt.title("Rio Grande do Sul: Price vs. Area"); # title "Rio Grande do Sul: Price vs. Area"
# Create a dictionary "south_states_corr", where the keys are the names of the three states in the "South" region and their associated values are the correlation coefficient between "area_m2" and "price_usd" in that state
south_states_corr = {}
df_south["area_m2"].corr(df_south["price_usd"])
south_states_corr
{}
south_states_corr = {}
south_states = df[df["region"] == 'South']["state"].unique()
for state in south_states:
df_state = df[df["state"] == state]
p_corr = df_state["area_m2"].corr(df_state["price_usd"])
south_states_corr[state] = p_corr
south_states_corr
{'Paraná': 0.5436659935502657, 'Rio Grande do Sul': 0.5773267433871903, 'Santa Catarina': 0.5068121769989855}