__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import seaborn as sns
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Read the file "data/SCFP2019.csv.gz" into the DataFrame df
def wrangle(filepath):

    """Read SCF data file into ``DataFrame``.


    Parameters
    ----------
    filepath : str
        Location of CSV file.
    """
    # Load data
    df = pd.read_csv(filepath)

    return df


df = wrangle("data/SCFP2019.csv.gz")
print("df shape:", df.shape)
df.head()

df shape: (28885, 351)


# Calculate the percentage of respondents in df that are business owners, and assign the result to the variable pct_biz_owners
# Mask = df["HBUS"] == 1
pct_biz_owners = (sum(df["HBUS"])) / (sum(df["HBUS"] == 0) + sum(df["HBUS"]))
print("% of business owners in df:", pct_biz_owners)

% of business owners in df: 0.2740176562229531


# Create a DataFrame df_inccat that shows the normalized frequency for income categories for business owners and non-business owners
inccat_dict = {
    1: "0-20",
    2: "21-39.9",
    3: "40-59.9",
    4: "60-79.9",
    5: "80-89.9",
    6: "90-100",
}

df_inccat = (
    df["INCCAT"]
    .replace(inccat_dict)
    .groupby(df["HBUS"])
    .value_counts(normalize=True)
    .rename("frequency")
    .to_frame()
    .reset_index()

)

df_inccat


# Using seaborn, create a side-by-side bar chart of df_inccat
sns.barplot(
    x="INCCAT",  # Income categories are in the correct order along the x-axis
    y="frequency",
    hue="HBUS",
    data=df_inccat,
    order=inccat_dict.values()
)
plt.xlabel("Income Category") # Label the x-axis "Income Category"
plt.ylabel("Frequency (%)") # Label the y-axis "Frequency (%)"
plt.title("Income Distribution: Business Owners vs. Non-Business Owners"); # Title "Income Distribution: Business Owners vs. Non-Business Owners"


X = df[["DEBT", "HOUSES"]]
print(X.shape)
X.head()

(28885, 2)


# Build model
model = KMeans(n_clusters=3, random_state=42)
# Fit model to data
model.fit(X)

KMeans(n_clusters=3, random_state=42)

KMeans(n_clusters=3, random_state=42)


labels = model.labels_
labels[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)


labels = model.labels_
labels[-10:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)


# Plot "HOUSES" vs "DEBT" with hue=label
sns.scatterplot(
    x=df["HOUSES"],
    y=df["DEBT"],
    hue=labels,
    palette="deep"
)
plt.xlabel("Household Debt") # Label the x-axis "Household Debt"
plt.ylabel("Home Value") # Label the y-axis "Home Value"
plt.title("Home Value vs. Household Debt"); # Title "Home Value vs. Household Debt"


# Create a new DataFrame df_small_biz that contains only business owners whose income is below $500,000
mask = (df["HBUS"] == 1) & (df["INCOME"] < 5e5)
df_small_biz = df[mask]
print("df_small_biz shape:", df_small_biz.shape)
df_small_biz.head()

df_small_biz shape: (4364, 351)


# Create a histogram from the "AGE" column in df_small_biz with 10 bins
df_small_biz["AGE"].hist(bins=10)
plt.xlabel("Age") # Label the x-axis "Age"
plt.ylabel("Frequency (count)") # Label the "Frequency (count)"
plt.title("Small Business Owners: Age Distribution"); # Title "Small Business Owners: Age Distribution"


# Calculate the variance for all the features in df_small_biz, and create a Series top_ten_var with the 10 features with the largest variance
top_ten_var = df_small_biz.var().sort_values().tail(10)
top_ten_var

EQUITY      1.005088e+13
FIN         2.103228e+13
KGBUS       5.025210e+13
ACTBUS      5.405021e+13
BUS         5.606717e+13
KGTOTAL     6.120760e+13
NHNFIN      7.363197e+13
NFIN        9.244074e+13
NETWORTH    1.424450e+14
ASSET       1.520071e+14
dtype: float64


df_small_biz["DEBT"].var()

649110694613.4797


trimmed_var?


trimmed_var(df_small_biz["DEBT"])

23255523760.72293


df_small_biz.var()

YY1             2.674590e+06
Y1              2.674591e+08
WGT             7.099420e+06
HHSEX           6.913755e-02
AGE             1.889664e+02
                    ...     
NWPCTLECAT      6.478679e+00
INCPCTLECAT     7.977168e+00
NINCPCTLECAT    7.255083e+00
INCQRTCAT       9.258707e-01
NINCQRTCAT      8.509552e-01
Length: 351, dtype: float64


df_small_biz.apply(trimmed_var)

YY1             1.685205e+06
Y1              1.685205e+08
WGT             3.938610e+06
HHSEX           0.000000e+00
AGE             9.257535e+01
                    ...     
NWPCTLECAT      2.758027e+00
INCPCTLECAT     3.947471e+00
NINCPCTLECAT    3.673245e+00
INCQRTCAT       4.812095e-01
NINCQRTCAT      4.521251e-01
Length: 351, dtype: float64


df_small_biz.apply(trimmed_var, limits=(0.1, 0.1))

YY1             1.685205e+06
Y1              1.685205e+08
WGT             3.938610e+06
HHSEX           0.000000e+00
AGE             9.257535e+01
                    ...     
NWPCTLECAT      2.758027e+00
INCPCTLECAT     3.947471e+00
NINCPCTLECAT    3.673245e+00
INCQRTCAT       4.812095e-01
NINCQRTCAT      4.521251e-01
Length: 351, dtype: float64


df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values()

HOTHMA      0.000000e+00
HCALL       0.000000e+00
PAYILN1     0.000000e+00
HLIQ        0.000000e+00
PAYEDU7     0.000000e+00
                ...     
BUS         6.531708e+11
NHNFIN      1.109187e+12
NFIN        1.792707e+12
NETWORTH    3.726356e+12
ASSET       3.990101e+12
Length: 351, dtype: float64


df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10)

EQUITY      1.177020e+11
KGBUS       1.838163e+11
FIN         3.588855e+11
KGTOTAL     5.367878e+11
ACTBUS      5.441806e+11
BUS         6.531708e+11
NHNFIN      1.109187e+12
NFIN        1.792707e+12
NETWORTH    3.726356e+12
ASSET       3.990101e+12
dtype: float64


# Calculate trimmed variance
top_ten_trim_var = df_small_biz.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10)
top_ten_trim_var

EQUITY      1.177020e+11
KGBUS       1.838163e+11
FIN         3.588855e+11
KGTOTAL     5.367878e+11
ACTBUS      5.441806e+11
BUS         6.531708e+11
NHNFIN      1.109187e+12
NFIN        1.792707e+12
NETWORTH    3.726356e+12
ASSET       3.990101e+12
dtype: float64


# Use plotly express to create a horizontal bar chart of top_ten_trim_var
fig = px.bar(
    x=top_ten_trim_var,
    y=top_ten_trim_var.index,
    title="Small Business Owners: High Variance Features" # Title "Small Business Owners: High Variance Features"
)
fig.update_layout(xaxis_title="Trimmed Variance [$]", yaxis_title="Feature") # Label x-axis "Trimmed Variance [$]" & y-axis "Feature"


top_ten_trim_var

EQUITY      1.177020e+11
KGBUS       1.838163e+11
FIN         3.588855e+11
KGTOTAL     5.367878e+11
ACTBUS      5.441806e+11
BUS         6.531708e+11
NHNFIN      1.109187e+12
NFIN        1.792707e+12
NETWORTH    3.726356e+12
ASSET       3.990101e+12
dtype: float64


top_ten_trim_var.tail(5)

BUS         6.531708e+11
NHNFIN      1.109187e+12
NFIN        1.792707e+12
NETWORTH    3.726356e+12
ASSET       3.990101e+12
dtype: float64


top_ten_trim_var.tail(5).index

Index(['BUS', 'NHNFIN', 'NFIN', 'NETWORTH', 'ASSET'], dtype='object')


top_ten_trim_var.tail(5).index.to_list()

['BUS', 'NHNFIN', 'NFIN', 'NETWORTH', 'ASSET']


high_var_cols = top_ten_trim_var.tail(5).index.to_list()
high_var_cols

['BUS', 'NHNFIN', 'NFIN', 'NETWORTH', 'ASSET']


# Create the feature matrix X
X = df_small_biz[high_var_cols] # Contains the five columns in high_var_cols
print("X shape:", X.shape)
X.head()

X shape: (4364, 5)


n_clusters = range(2, 13)
inertia_errors = []
silhouette_scores = []

# Add `for` loop to train model and calculate inertia, silhouette score.
for k in n_clusters:
    # Build model 
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42)) # Set the random state for my model to 42 for reproducibility
    # Train model 
    model.fit(X)
    # Calculate intertia
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    # Calculate silhouette score
    silhouette_scores.append(
        silhouette_score(X, model.named_steps["kmeans"].labels_)
    )

print("Inertia:", inertia_errors[:11])
print()
print("Silhouette Scores:", silhouette_scores[:3])

Inertia: [5765.863949365074, 3070.4294488357577, 2220.2921850896846, 1777.4635570665591, 1443.786007103407, 1173.3701169575006, 1004.0082329287375, 892.7197264630447, 780.7646441851746, 678.9317940468634, 601.010706235275]

Silhouette Scores: [0.9542706303253067, 0.8446503900103915, 0.7422220122162623]


# Use plotly express to create a line plot that shows the values of inertia_errors as a function of n_clusters
fig = px.line(
    x=n_clusters, y=inertia_errors, title="K-Means Model: Inertia vs Number of Clusters" # Label x-axis "Number of Clusters" & y-axis "Inertia" 
)
fig.update_layout(xaxis_title="Number of Clusters", yaxis_title="Inertia") # Title "K-Means Model: Inertia vs Number of Clusters"


# Use plotly express to create a line plot that shows the values of silhouette_scores as a function of n_clusters
fig = px.line(
    x=n_clusters,
    y=silhouette_scores,
    title="K-Means Model: Silhouette Score vs Number of Clusters" # Title "K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(
    xaxis_title="Number of Clusters", yaxis_title="Silhouette Score")  # Label x-axis "Number of Clusters" & y-axis "Silhouette Score"


# Build and train a new k-means model named final_model
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, random_state=42) # The number of clusters should be 3
)
final_model.fit(X)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3, random_state=42))])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3, random_state=42))])

StandardScaler()

KMeans(n_clusters=3, random_state=42)


labels = final_model.named_steps["kmeans"].labels_
print(labels[:5])

[0 0 0 0 0]


xgb = X.groupby(labels).mean()
xgb


# Create a DataFrame xgb that contains the mean values of the features in X for the 3 clusters in my final_model
labels = final_model.named_steps["kmeans"].labels_
xgb = X.groupby(labels).mean()
xgb


# Use plotly express to create a side-by-side bar chart from xgb that shows the mean of the features in X for each of the clusters in my final_model
fig = px.bar(
    xgb,
    barmode="group",
    title="Small Business Owner Finances by Cluster" # Title "Small Business Owner Finances by Cluster"
)

fig.update_layout(xaxis_title="Cluster", yaxis_title="Value [$]") # Label the x-axis "Cluster" & the y-axis "Value [$]"


# Instantiate transformer
pca = PCA(n_components=2, random_state=42)

# Transform `X`
X_t = pca.fit_transform(X)

# Put `X_t` into DataFrame
X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"] )

print("X_pca shape:", X_pca.shape)
X_pca.head()

X_pca shape: (4364, 2)


labels

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)


labels.astype(str)

array(['0', '0', '0', ..., '0', '0', '0'], dtype='<U11')


# Use plotly express to create a scatter plot of X_pca using seaborn
fig = px.scatter(
    data_frame=X_pca,
    x="PC1", # Label the x-axis "PC1"
    y="PC2", # Label the y-axis "PC2" 
    color=labels, 
    title="PCA Representation of Clusters" # Title "PCA Representation of Clusters"
)
fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
fig.show()

	NHNFIN	NFIN	NETWORTH	ASSET
80	224000.0	724000.0	237600.0	810600.0
81	223000.0	723000.0	236600.0	809600.0
82	224000.0	724000.0	237600.0	810600.0
83	222000.0	722000.0	234600.0	808600.0
84	223000.0	723000.0	237600.0	809600.0

	BUS	NHNFIN	NFIN	NETWORTH	ASSET
0	7.367185e+05	1.002199e+06	1.487967e+06	2.076003e+06	2.281249e+06
1	6.874479e+07	8.202115e+07	9.169652e+07	1.134843e+08	1.167529e+08
2	1.216152e+07	1.567619e+07	1.829123e+07	2.310024e+07	2.422602e+07

	BUS	NHNFIN	NFIN	NETWORTH	ASSET
0	7.367185e+05	1.002199e+06	1.487967e+06	2.076003e+06	2.281249e+06
1	6.874479e+07	8.202115e+07	9.169652e+07	1.134843e+08	1.167529e+08
2	1.216152e+07	1.567619e+07	1.829123e+07	2.310024e+07	2.422602e+07

	PC1	PC2
0	-6.220648e+06	-503841.638840
1	-6.222523e+06	-503941.888901
2	-6.220648e+06	-503841.638839
3	-6.224927e+06	-504491.429465
4	-6.221994e+06	-503492.598399

Prepare Data¶

Import¶

Explore¶

Split¶

Build Model¶

Iterate¶

Communicate¶

	YY1	Y1	WGT	HHSEX	AGE	AGECL	EDUC	EDCL	MARRIED	...	NWCAT	INCCAT	ASSETCAT	NINCCAT	NINC2CAT	NWPCTLECAT	INCPCTLECAT	NINCPCTLECAT	INCQRTCAT	NINCQRTCAT
0	1	11	6119.779308	2	75	6	12	4	2	...	5	3	6	3	2	10	6	6	3	3
1	1	12	4712.374912	2	75	6	12	4	2	...	5	3	6	3	1	10	5	5	2	2
2	1	13	5145.224455	2	75	6	12	4	2	...	5	3	6	3	1	10	5	5	2	2
3	1	14	5297.663412	2	75	6	12	4	2	...	5	2	6	2	1	10	4	4	2	2
4	1	15	4761.812371	2	75	6	12	4	2	...	5	3	6	3	1	10	5	5	2	2

	HBUS	INCCAT	frequency
0	0	0-20	0.210348
1	0	21-39.9	0.198140
2	0	40-59.9	0.189080
3	0	60-79.9	0.186600
4	0	90-100	0.117167
5	0	80-89.9	0.098665
6	1	90-100	0.629438
7	1	60-79.9	0.119015
8	1	80-89.9	0.097410
9	1	40-59.9	0.071510
10	1	21-39.9	0.041440
11	1	0-20	0.041188

	DEBT	HOUSES
0	0.0	1100000.0
1	0.0	1100000.0
2	0.0	1100000.0
3	0.0	1100000.0
4	0.0	1100000.0

	YY1	Y1	WGT	HHSEX	AGE	AGECL	EDUC	EDCL	MARRIED	...	NWCAT	INCCAT	ASSETCAT	NINCCAT	NINC2CAT	NWPCTLECAT	INCPCTLECAT	NINCPCTLECAT	INCQRTCAT	NINCQRTCAT
80	17	171	7802.265717	1	62	4	12	4	1	...	3	5	5	5	2	7	9	9	4	4
81	17	172	8247.536301	1	62	4	12	4	1	...	3	5	5	5	2	7	9	9	4	4
82	17	173	8169.562719	1	62	4	12	4	1	...	3	5	5	5	2	7	9	9	4	4
83	17	174	8087.704517	1	62	4	12	4	1	...	3	5	5	5	2	7	9	9	4	4
84	17	175	8276.510048	1	62	4	12	4	1	...	3	5	5	5	2	7	9	9	4	4