Survey of Consumer Finances: Unsupervised Clustering

Part 2: Clustering with Two Features

In [2]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"

In the Exploring the Data project, I explored data from the Survey of Consumer Finances (SCF), paying special attention to households that have been turned down for credit or feared being denied credit. In this project, I'll build a model to segment those households into distinct clusters and examine the differences between those clusters.

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

GOALS

  • Subset my data to focus on "credit fearful" households.
  • Explore the relationship between debt and home value.
  • Build an unsupervised model to divide households into groups.
  • Examine the mean characteristics of each group.

Machine Learning Workflow

  • Prepare Data
    • Import
    • Explore
    • Split: No target vector
  • Build Model
    • Iterate
      • K-means model, choose number of clusters.
        • Clusters, centroids, inertia, silhouette score.
  • Communicate Results
    • Cluster centroids
    • Side-by-side bar chart

Prepare Data

Import

I need to begin by bringing my data into the project. I spent some time in the Exploring the Data project working with a subset of the larger SCF dataset called "TURNFEAR". I'll start with that.

Wrangle Function

In [3]:
# Create a wrangle function that takes a path of a CSV file as input, reads the file into a DataFrame, subsets the data to households that have been turned down for credit or feared being denied credit in the past 5 years (see "TURNFEAR"), and returns the subset DataFrame
def wrangle(filepath):
    df = pd.read_csv(filepath)
    mask = df["TURNFEAR"] == 1
    df = df[mask]
    return df

And now that I've got that taken care of, I'll import the data and see what I've got.

In [4]:
# Use my wrangle function to read the file SCFP2019.csv.gz into a DataFrame named df
df = wrangle("data/SCFP2019.csv.gz")
print(df.shape)
df.head()
(4623, 351)
Out[4]:
YY1 Y1 WGT HHSEX AGE AGECL EDUC EDCL MARRIED KIDS ... NWCAT INCCAT ASSETCAT NINCCAT NINC2CAT NWPCTLECAT INCPCTLECAT NINCPCTLECAT INCQRTCAT NINCQRTCAT
5 2 21 3790.476607 1 50 3 8 2 1 3 ... 1 2 1 2 1 1 4 4 2 2
6 2 22 3798.868505 1 50 3 8 2 1 3 ... 1 2 1 2 1 1 4 3 2 2
7 2 23 3799.468393 1 50 3 8 2 1 3 ... 1 2 1 2 1 1 4 4 2 2
8 2 24 3788.076005 1 50 3 8 2 1 3 ... 1 2 1 2 1 1 4 4 2 2
9 2 25 3793.066589 1 50 3 8 2 1 3 ... 1 2 1 2 1 1 4 4 2 2

5 rows × 351 columns

Explore

I looked at a lot of different features of the "TURNFEAR" subset in the Exploring the Data project, and the last thing I looked at was the relationship between real estate and debt. I'm going to make that graph again to refresh my memory of what that relationship looked like.

Scatter Plot: Home vs Debt

In [5]:
df["DEBT"].head()
Out[5]:
5    12200.0
6    12600.0
7    15300.0
8    14100.0
9    15400.0
Name: DEBT, dtype: float64
In [6]:
(df["DEBT"] / 1_000_000).head()
Out[6]:
5    0.0122
6    0.0126
7    0.0153
8    0.0141
9    0.0154
Name: DEBT, dtype: float64
In [7]:
(df["DEBT"] /1e6).head()
Out[7]:
5    0.0122
6    0.0126
7    0.0153
8    0.0141
9    0.0154
Name: DEBT, dtype: float64
In [8]:
sns.scatterplot(x=df["DEBT"], y=df["HOUSES"])
plt.xlabel("Household Debt [$1M]") 
plt.ylabel("Home Value [$1M]") 
plt.title("Credit Fearful: Home Value vs. Household Debt");
In [9]:
# Create a scatter plot that shows the total value of the primary residence of a household ("HOUSES") as a function of the total value of household debt ("DEBT")
sns.scatterplot(x=df["DEBT"]/1e6, y=df["HOUSES"]/1e6)
plt.xlabel("Household Debt [$1M]") # Label x-axis as "Household Debt"
plt.ylabel("Home Value [$1M]") # Label y-axis as "Home Value"
plt.title("Credit Fearful: Home Value vs. Household Debt"); # Title "Credit Fearful: Home Value vs. Household Debt"

Split

I need to split my data, but I'm not going to need a target vector or a test set this time around. That's because the model I'll be building involves unsupervised learning. It's called unsupervised because the model doesn't try to map inputs to a set of labels or targets that already exist.
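For contrast, here's a minimal sketch of what the split would look like in a supervised project versus this one (the target column name below is purely hypothetical):

from sklearn.model_selection import train_test_split

# Supervised workflow (hypothetical target column "TARGET"): the split produces
# a target vector and held-out test data
# X_train, X_test, y_train, y_test = train_test_split(
#     df.drop(columns="TARGET"), df["TARGET"], random_state=42
# )

# Unsupervised workflow (this project): no target, no test set; the only "split"
# is selecting the feature columns, which happens in the next cell.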

Vertical Split

In [10]:
# Create the feature matrix X
X = df[["DEBT", "HOUSES"]] # Should contain two features only: "DEBT" and "HOUSES"
print(X.shape)
X.head()
(4623, 2)
Out[10]:
DEBT HOUSES
5 12200.0 0.0
6 12600.0 0.0
7 15300.0 0.0
8 14100.0 0.0
9 15400.0 0.0

Build Model

Unsupervised Learning & K-Means

In [11]:
# ClusterWidget is an interactive helper defined outside this notebook;
# it animates the k-means steps on a toy dataset as I drag a slider
cw = ClusterWidget(n_clusters=3)
cw.show()
[Interactive output: step slider (0–10) with a cluster plot]

At the first position, there's a whole bunch of gray datapoints, and if I look carefully, I'll see there are also three stars. Those stars are the centroids. At first, their positions are set randomly. If I move the slider one more position to the right, I'll see all the gray points change to colors that correspond to three clusters.

Since a centroid represents the mean value of all the data in the cluster, I would expect it to fall in the center of whatever cluster it's in. That's what will happen if I move the slider one more position to the right.

But since the centroids moved, the datapoints might not be in the right clusters anymore. If I move the slider again, I'll see the datapoints redistribute themselves to better reflect the new positions of the centroids. The new clusters mean that the centroids also need to move, which will lead to the clusters changing again, and so on, until all the datapoints end up in the right cluster with a centroid that reflects the mean value of all those points.
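To make those steps concrete, here's a minimal from-scratch sketch of the k-means loop using NumPy (np and a plain two-column array are assumptions here; scikit-learn's KMeans does all of this, plus smarter centroid initialization, for me):

import numpy as np

def kmeans_sketch(points, k=3, n_iter=10, seed=42):
    # Start with k centroids drawn at random from the data
    rng = np.random.default_rng(seed)
    centroids = points[rng.choice(len(points), size=k, replace=False)]
    for _ in range(n_iter):
        # Assignment step: each point joins the cluster of its nearest centroid
        distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned points
        new_centroids = []
        for j in range(k):
            members = points[labels == j]
            # Keep the old centroid if a cluster happens to end up empty
            new_centroids.append(members.mean(axis=0) if len(members) else centroids[j])
        centroids = np.array(new_centroids)
    return labels, centroids

Calling something like kmeans_sketch(df[["DEBT", "HOUSES"]].to_numpy()) would mimic what the slider animates: assignments and centroids alternate until they stop changing.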

I'll see what happens when I try the same with my "DEBT" and "HOUSES" data.

K-Means: SCF Example

In [12]:
# SCFClusterWidget is another interactive helper defined outside this notebook;
# it runs the same animation on the "DEBT" and "HOUSES" columns
scfc = SCFClusterWidget(x=df["DEBT"], y=df["HOUSES"], n_clusters=3)
scfc.show()
[Interactive output: step slider (0–10) with a cluster plot of the SCF data]

Iterate

Now that I've had a chance to play around with the process a little bit, I'll get into building a model that does the same thing.

Build Model

I'll build a KMeans model, assign it to the variable name model, and fit it to the training data X.

In [50]:
# Build model
model = KMeans(n_clusters=3, random_state=42)
# Fit model to data
model.fit(X)
Out[50]:
KMeans(n_clusters=3, random_state=42)

That's all 4,623 datapoints assigned to three clusters. I'll grab the labels that the model assigned to the datapoints so I can start making a new visualization.

Extract Cluster Labels

In [51]:
# Extract the labels that my model created during training and assign them to the variable labels
labels = model.labels_
labels[:10]
Out[51]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)
In [52]:
labels = model.labels_
labels[-10:]
Out[52]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Using the labels I just extracted, I'll recreate the scatter plot from before; this time, I'll color each point according to the cluster to which the model assigned it.

In [53]:
# Plot "HOUSES" vs "DEBT" with hue=label
sns.scatterplot(
    x=df["DEBT"]/1e6,
    y=df["HOUSES"]/1e6,
    hue=labels,
    palette="deep"
)
plt.xlabel("Household Debt [$1M]")
plt.ylabel("Home Value [$1M]")
plt.title("Credit Fearful: Home Value vs. Household Debt");

Each cluster has its own color. The centroids are still missing, so I'll pull those out.

Extract Centroids

In [54]:
# Extract the centroids that my model created during training, and assign them to the variable centroids
centroids = model.cluster_centers_
centroids
Out[54]:
array([[   91017.57766674,   116150.29328698],
       [18384100.        , 34484000.        ],
       [ 5065800.        , 11666666.66666667]])

I'll add the centroids to the graph.

Plot Centroids

In [55]:
# Plot "HOUSES" vs "DEBT", add centroids
sns.scatterplot(
    x=df["DEBT"]/1e6,
    y=df["HOUSES"]/1e6,
    hue=labels,
    palette="deep"
)
plt.scatter(
    x=centroids[:, 0]/1e6,
    y=centroids[:, 1]/1e6,
    color="gray",
    marker="*",
    s=150
)
plt.xlabel("Household Debt [$1M]")
plt.ylabel("Home Value [$1M]")
plt.title("Credit Fearful: Home Value vs. Household Debt");

My graph makes it look like the clusters are correctly assigned, but I still need a numerical evaluation. The data I'm using is pretty clear-cut, but if things were a little more muddled, I'd want to run some calculations to make sure I got everything right.

There are two metrics that I'll use to evaluate my clusters. I'll start with inertia, which measures the sum of squared distances between each point and the centroid of its cluster, so smaller values mean tighter clusters.

Inertia

In [56]:
# Extract the inertia for my model and assign it to the variable inertia
inertia = model.inertia_
print("Inertia (3 clusters):", inertia)
Inertia (3 clusters): 939554010797059.4

The "best" inertia is 0, and my score is pretty far from that. Does that mean my model is "bad?" Not necessarily. Inertia is a measurement of distance (like mean absolute error). This means that the unit of measurement for inertia depends on the unit of measurement of my x- and y-axes. And since "DEBT" and "HOUSES" are measured in tens of millions of dollars, it's not surprising that inertia is so large.

However, it would be helpful to have a metric that's easier to interpret, and that's where silhouette score comes in. Silhouette score measures how well-separated the clusters are: how close each point is to its own cluster compared to the nearest other cluster. It ranges from -1 (the worst) to 1 (the best), so it's easier to interpret than inertia.

Silhouette Score

In [57]:
# Calculate the silhouette score for my model and assign it to the variable ss
ss = silhouette_score(X, model.labels_)
print("Silhouette Score (3 clusters):", ss)
Silhouette Score (3 clusters): 0.9768842462944348

0.976 is pretty close to 1, so my model has done a good job at identifying 3 clusters that are far away from each other.
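For a more granular view, scikit-learn also provides silhouette_samples, which returns one score per observation; the overall silhouette score is just their mean. A quick sketch, assuming X and model from above:

import numpy as np
from sklearn.metrics import silhouette_samples

# One silhouette coefficient per household; averaging them reproduces silhouette_score
per_point = silhouette_samples(X, model.labels_)
print("Mean of per-point scores:", per_point.mean())
print("Lowest per-point scores:", np.sort(per_point)[:5])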

These performance metrics are the result of the number of clusters I told my model to create. In unsupervised learning, the number of clusters is a hyperparameter that I set before training my model. So what would happen if I changed the number of clusters? Would it lead to better performance? I'll give it a shot here.

Finding the Best K

I'll use a for loop to build and train a K-Means model where n_clusters ranges from 2 to 12 (inclusive). Each time a model is trained, I'll calculate the inertia and add it to the list inertia_errors, then calculate the silhouette score and add it to the list silhouette_scores.

In [58]:
n_clusters = range(2, 13)
inertia_errors = []
silhouette_scores = []

# Add `for` loop to train model and calculate inertia, silhouette score.
for k in n_clusters:
    # Build model
    model = KMeans(n_clusters=k, random_state=42)
    # Train model
    model.fit(X)
    # Calculate inertia
    inertia_errors.append(model.inertia_)
    # Calculate ss
    silhouette_scores.append(silhouette_score(X, model.labels_))


print("Inertia:", inertia_errors)
print()
print("Silhouette Scores:", silhouette_scores)
Inertia: [3018038313336857.5, 939554010797059.4, 546098841715646.25, 309310386410913.3, 235243397481784.3, 182225729179703.53, 150670779013790.4, 114321995931021.89, 100340259483919.02, 86229997033602.88, 74757234072100.36]

Silhouette Scores: [0.9855099957519555, 0.9768842462944348, 0.9490311483406091, 0.839330043242819, 0.7287406719898627, 0.726989114305748, 0.7263840026889208, 0.7335125606476427, 0.692157992955073, 0.6949309528556856, 0.6951831031001252]

Now that I have both performance metrics for several different settings of n_clusters, I'll make some line plots to see the relationship between the number of clusters in a model and its inertia and silhouette scores.

Inertia vs Clusters

In [59]:
# Create a line plot that shows the values of inertia_errors as a function of n_clusters
plt.plot(n_clusters, inertia_errors)
plt.xlabel("Number of Clusters (k)") # Label x-axis "Number of Clusters"
plt.ylabel("Inertia") # Label y-axis "Inertia"
plt.title("K-Means Model: Inertia vs Number of Clusters"); # Title "K-Means Model: Inertia vs Number of Clusters"

What I'm seeing here is that, as the number of clusters increases, inertia goes down. In fact, I could get inertia to 0 if I told my model to make 4,623 clusters (the same as the number of observations in X), but those clusters wouldn't be helpful to me.

The trick with choosing the right number of clusters is to look for the "bend in the elbow" for this plot. In other words, I want to pick the point where the drop in inertia becomes less dramatic and the line begins to flatten out. In this case, it looks like the sweet spot is 4 or 5.
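One way to make the elbow less of a judgment call is to look at how much inertia drops with each additional cluster; a small sketch, assuming n_clusters and inertia_errors from the loop above:

import pandas as pd

# Percent drop in inertia gained by adding each extra cluster; the "elbow" is
# roughly where these gains stop being dramatic
inertia_series = pd.Series(inertia_errors, index=n_clusters)
print((-inertia_series.pct_change() * 100).round(1))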

I'll see what the silhouette score looks like.

Silhouette Score vs Clusters

In [60]:
# Create a line plot that shows the values of silhouette_scores as a function of n_clusters
plt.plot(n_clusters, silhouette_scores)
plt.xlabel("Number of Clusters (k)")  # Label x-axis "Number of Clusters"
plt.ylabel("Silhouette Score")    # Label y-axis "Silhouette Score"
plt.title("K-Means Model: Silhouette Score vs Number of Clusters"); # Title "K-Means Model: Silhouette Score vs Number of Clusters"

Note that, in contrast to my inertia plot, bigger is better. So I'm not looking for a "bend in the elbow" but rather a number of clusters for which the silhouette score still remains high. I can see that silhouette score drops drastically beyond 4 clusters. Given this and what I saw in the inertia plot, it looks like the optimal number of clusters is 4.
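To keep both metrics in view while settling on k, it can help to line them up in one table; a small sketch, assuming the lists from the loop above:

import pandas as pd

# Side-by-side comparison of inertia and silhouette score for each candidate k
summary = pd.DataFrame(
    {"inertia": inertia_errors, "silhouette": silhouette_scores},
    index=pd.Index(n_clusters, name="k"),
)
print(summary.round(3))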

Now that I've decided on the final number of clusters, I'll build a final model.

Build Final Model

I'll build and train a new k-means model named final_model. I'll use the information I gained from the two plots above to set an appropriate value for the n_clusters argument.

In [61]:
# Build model
final_model = KMeans(n_clusters=4, random_state=42)
# Fit model to data
final_model.fit(X)
Out[61]:
KMeans(n_clusters=4, random_state=42)
In [62]:
# Check which k-means variant the model is configured to use ('lloyd' is the classic assignment/update loop)
final_model.algorithm
Out[62]:
'lloyd'

(I don't need an Evaluate section in this project because I don't have any test data to evaluate my model with.)

Communicate

Plot Final Clusters

Now I want to create one last "Home Value vs. Household Debt" scatter plot that shows the clusters that my final_model has assigned to the training data.

In [63]:
# Plot "HOUSES" vs "DEBT" with final_model labels
sns.scatterplot(
    x=df["DEBT"],
    y=df["HOUSES"],
    hue=final_model.labels_,
    palette="deep"
)
plt.xlabel("Household Debt [$1M]")
plt.ylabel("Home Value [$1M]")
plt.title("Credit Fearful: Home Value vs. Household Debt");

I can see all four of my clusters, each differentiated from the rest by color.

I'm going to make one more visualization, converting the cluster analysis I just did to something a little more actionable: a side-by-side bar chart. In order to do that, I need to put my clustered data into a DataFrame.

Side-by-Side Bar Chart: Get Centroids

In [64]:
xgb = X.groupby(final_model.labels_)
xgb
Out[64]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa4ccd273d0>
In [65]:
# Create a DataFrame xgb that contains the mean "DEBT" and "HOUSES" values for each of the clusters in my final_model
xgb = X.groupby(final_model.labels_).mean()
xgb
Out[65]:
DEBT HOUSES
0 8.488629e+04 1.031872e+05
1 1.838410e+07 3.448400e+07
2 5.472800e+06 1.407400e+07
3 2.420929e+06 4.551429e+06
In [66]:
final_model.cluster_centers_
Out[66]:
array([[   84886.28951384,   103187.22476563],
       [18384100.        , 34484000.        ],
       [ 5472800.        , 14074000.        ],
       [ 2420928.57142857,  4551428.57142857]])

Note that the group means in xgb line up with the cluster_centers_ of my final_model, which makes sense: a centroid is just the mean of the points assigned to its cluster.
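A quick way to check that claim, assuming xgb and final_model from the cells above (tiny discrepancies are possible because labels get reassigned once more after the last centroid update):

import numpy as np

# The groupby means should essentially reproduce the model's centroids
print(np.allclose(xgb.values, final_model.cluster_centers_))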

Side-by-Side Bar Chart: Build Chart

In [67]:
xgb.plot(kind="bar")
plt.xlabel("Cluster")
plt.ylabel("Value [$1 million]")
plt.title("Mean Home Value & Household Debt by Cluster");
In [68]:
# Create a side-by-side bar chart from xgb that shows the mean "DEBT" and "HOUSES" values for each of the clusters in my final_model
(xgb/1e6).plot(kind="bar")  # Divide the values in xgb by 1 million for readability
plt.xlabel("Cluster") # Label the x-axis "Cluster"
plt.ylabel("Value [$1 million]") # Label the y-axis "Value [$1 million]"
plt.title("Mean Home Value & Household Debt by Cluster");
In [69]:
(xgb["DEBT"] / xgb["HOUSES"]).plot(
    kind="bar",
    xlabel="Cluster",
    ylabel="Proportion, Debt/Home",
    title="Proportion of Debt to Home Value"
);

In the side-by-side bar chart, I have my four clusters spread across the x-axis and the mean dollar amounts for home value and household debt on the y-axis.

The first thing to look at in this chart is the different mean home values for the four clusters. Cluster 0 represents households with small to moderate home values, clusters 2 and 3 have high home values, and cluster 1 has extremely high values.

The second thing to look at is the proportion of debt to home value. In clusters 1 and 3, this proportion is around 0.5, which suggests that these groups have a moderate amount of untapped equity in their homes. But for group 0, it's above 0.8, which suggests that the largest source of household debt is their mortgage. Group 2 is unique in that it has the smallest proportion of debt to home value, around 0.4.

This information could be useful to financial institutions that want to target customers with products that would appeal to them. For instance, households in group 0 might be interested in refinancing their mortgage to lower their interest rate. Group 2 households could be interested in a home equity line of credit because they have more equity in their homes. And the bankers, Bill Gates, and Beyoncés in group 1 might want white-glove personalized wealth management.