Survey of Consumer Finances: Unsupervised Clustering
Part 3: Clustering with Multiple Features
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In the Clustering with Two Features project, I built a K-Means model to create clusters of respondents to the Survey of Consumer Finances. I made my clusters by looking at two features only, but there are hundreds of features in the dataset that I didn't take into account and that could contain valuable information. In this project, I'll examine all the features, selecting five to create clusters with. After I build my model and choose an appropriate number of clusters, I'll visualize multi-dimensional clusters in a 2D scatter plot using principal component analysis (PCA).
import pandas as pd
import plotly.express as px
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
GOALS
Machine Learning Workflow
I spent some time in the Clustering with Two Features project zooming in on a useful subset of the SCF, and this time, I'm going to zoom in even further. One of the persistent issues I've had with this dataset is that it includes some outliers in the form of ultra-wealthy households. This didn't make much of a difference for my last analysis, but it could pose a problem in this project, so I'm going to focus on families with net worth under \$2 million.
Wrangle Data
I'll rewrite my wrangle function from the Clustering with Two Features project so that it returns a DataFrame of households whose net worth is less than \$2 million and that have been turned down for credit or feared being denied credit in the past 5 years ("TURNFEAR").
def wrangle(filepath):
    # Read file into DataFrame
    df = pd.read_csv(filepath)

    # Keep only households that were turned down for credit or feared denial
    # ("TURNFEAR" == 1) and whose net worth is under $2 million
    mask = (df["TURNFEAR"] == 1) & (df["NETWORTH"] < 2e6)
    df = df[mask]

    return df
df = wrangle("data/SCFP2019.csv.gz")
print(df.shape)
df.head()
(4418, 351)
YY1 | Y1 | WGT | HHSEX | AGE | AGECL | EDUC | EDCL | MARRIED | KIDS | ... | NWCAT | INCCAT | ASSETCAT | NINCCAT | NINC2CAT | NWPCTLECAT | INCPCTLECAT | NINCPCTLECAT | INCQRTCAT | NINCQRTCAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 2 | 21 | 3790.476607 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
6 | 2 | 22 | 3798.868505 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 3 | 2 | 2 |
7 | 2 | 23 | 3799.468393 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
8 | 2 | 24 | 3788.076005 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
9 | 2 | 25 | 3793.066589 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
5 rows × 351 columns
In this project, I want to make clusters using more than two features, but which of the 351 columns should I use? Here, I'll pick candidate features for clustering by finding the numerical features with the largest variance.
Calculate Variance
x = df["DEBT"]
x.head()
5 12200.0 6 12600.0 7 15300.0 8 14100.0 9 15400.0 Name: DEBT, dtype: float64
x.var()
18482520920.408085
x.mean()
72701.25848800362
(x - x.mean()).head()
5 -60501.258488 6 -60101.258488 7 -57401.258488 8 -58601.258488 9 -57301.258488 Name: DEBT, dtype: float64
((x - x.mean())**2).head()
5 3.660402e+09 6 3.612161e+09 7 3.294904e+09 8 3.434107e+09 9 3.283434e+09 Name: DEBT, dtype: float64
sum((x - x.mean())**2)
81637294905442.52
sum((x - x.mean())**2) / (len(x) - 1)
18482520920.408085
df.var()
YY1 2.876461e+06 Y1 2.876460e+08 WGT 4.808436e+06 HHSEX 2.192779e-01 AGE 2.154642e+02 ... NWPCTLECAT 5.178676e+00 INCPCTLECAT 6.070208e+00 NINCPCTLECAT 6.136292e+00 INCQRTCAT 9.512230e-01 NINCQRTCAT 9.624869e-01 Length: 351, dtype: float64
df.var().sort_values()
PAYILN7 0.000000e+00 PAYLCO 0.000000e+00 PAYEDU7 0.000000e+00 PAYILN5 0.000000e+00 PAYILN6 0.000000e+00 ... NHNFIN 2.254163e+10 HOUSES 2.388459e+10 NETWORTH 4.847029e+10 NFIN 5.713939e+10 ASSET 8.303967e+10 Length: 351, dtype: float64
df.var().sort_values().tail(10)
PLOAN1 1.140894e+10 ACTBUS 1.251892e+10 BUS 1.256643e+10 KGTOTAL 1.346475e+10 DEBT 1.848252e+10 NHNFIN 2.254163e+10 HOUSES 2.388459e+10 NETWORTH 4.847029e+10 NFIN 5.713939e+10 ASSET 8.303967e+10 dtype: float64
# Calculate the variance for all the features in df, and create a Series top_ten_var with the 10 features with the largest variance
top_ten_var = df.var().sort_values().tail(10)
top_ten_var
PLOAN1 1.140894e+10 ACTBUS 1.251892e+10 BUS 1.256643e+10 KGTOTAL 1.346475e+10 DEBT 1.848252e+10 NHNFIN 2.254163e+10 HOUSES 2.388459e+10 NETWORTH 4.847029e+10 NFIN 5.713939e+10 ASSET 8.303967e+10 dtype: float64
It's harder to make sense of a list like this than it would be if I visualized it, so I'll make a graph.
Plot Variance
# Use plotly express to create a horizontal bar chart of "top_ten_var"
fig = px.bar(
    x=top_ten_var,
    y=top_ten_var.index,
    title="SCF: High Variance Features"  # Title "SCF: High Variance Features"
)
fig.update_layout(xaxis_title="Variance", yaxis_title="Feature") # Label x-axis "Variance" & y-axis "Feature"
fig.show()
One thing that I've seen throughout this project is that many of the wealth indicators are highly skewed, with a few outlier households having enormous wealth. Those outliers can affect my measure of variance. I'll see if that's the case with one of the features from top_ten_var.
Plot Distribution of NHNFIN
# Use plotly express to create a horizontal boxplot of "NHNFIN" to determine if the values are skewed
fig = px.box(
    data_frame=df,
    x="NHNFIN",
    title="Distribution of Non-home, Non-Financial Assets",  # Title "Distribution of Non-home, Non-Financial Assets"
)
fig.update_layout(xaxis_title="Value [$]") # Label the x-axis "Value [$]"
fig.show()
The distribution is massively right-skewed because of the huge outliers on the right side. Even though I already excluded households with a high net worth in my wrangle function, the variance is still being distorted by some extreme outliers.
The best way to deal with this is to look at the trimmed variance, where I remove extreme values before calculating the variance. I can do this using the trimmed_var function from the SciPy library.
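To build intuition for what trimming does, here's a minimal hand-rolled sketch that drops the lowest and highest 10% of DEBT values before computing the variance. It should land close to trimmed_var's result, although the exact cutoff rounding may differ slightly (trimmed_var also uses ddof=0 by default).
x = df["DEBT"].sort_values()
cut = int(len(x) * 0.1)                # number of observations to drop on each side
x_trimmed = x.iloc[cut: len(x) - cut]  # keep roughly the middle 80% of the distribution
print(x_trimmed.var(ddof=0))           # ddof=0 to match trimmed_var's default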
Calculate Trimmed Variance
I now want to calculate the trimmed variance for the features in df. My calculations should not include the top and bottom 10% of observations. Then I'll create a Series top_ten_trim_var with the 10 features with the largest variance.
df["DEBT"].var()
18482520920.408085
trimmed_var?
Signature: trimmed_var( a, limits=(0.1, 0.1), inclusive=(1, 1), relative=True, axis=None, ddof=0, ) Docstring: Returns the trimmed variance of the data along the given axis. Parameters ---------- a : sequence Input array limits : {None, tuple}, optional If `relative` is False, tuple (lower limit, upper limit) in absolute values. Values of the input array lower (greater) than the lower (upper) limit are masked. If `relative` is True, tuple (lower percentage, upper percentage) to cut on each side of the array, with respect to the number of unmasked data. Noting n the number of unmasked data before trimming, the (n*limits[0])th smallest data and the (n*limits[1])th largest data are masked, and the total number of unmasked data after trimming is n*(1.-sum(limits)) In each case, the value of one limit can be set to None to indicate an open interval. If limits is None, no trimming is performed inclusive : {(bool, bool) tuple}, optional If `relative` is False, tuple indicating whether values exactly equal to the absolute limits are allowed. If `relative` is True, tuple indicating whether the number of data being masked on each side should be rounded (True) or truncated (False). relative : bool, optional Whether to consider the limits as absolute values (False) or proportions to cut (True). axis : int, optional Axis along which to trim. ddof : {0,integer}, optional Means Delta Degrees of Freedom. The denominator used during computations is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- biased estimate of the variance. File: /opt/conda/lib/python3.9/site-packages/scipy/stats/mstats_basic.py Type: function
trimmed_var(df["DEBT"])
3089864647.655702
df.var()
YY1 2.876461e+06 Y1 2.876460e+08 WGT 4.808436e+06 HHSEX 2.192779e-01 AGE 2.154642e+02 ... NWPCTLECAT 5.178676e+00 INCPCTLECAT 6.070208e+00 NINCPCTLECAT 6.136292e+00 INCQRTCAT 9.512230e-01 NINCQRTCAT 9.624869e-01 Length: 351, dtype: float64
df.apply(trimmed_var)
YY1 1.850508e+06 Y1 1.850507e+08 WGT 1.412290e+06 HHSEX 2.019627e-01 AGE 1.139698e+02 ... NWPCTLECAT 2.580682e+00 INCPCTLECAT 3.429553e+00 NINCPCTLECAT 3.523264e+00 INCQRTCAT 5.850373e-01 NINCQRTCAT 6.002715e-01 Length: 351, dtype: float64
df.apply(trimmed_var, limits=(0.1, 0.1))
YY1 1.850508e+06 Y1 1.850507e+08 WGT 1.412290e+06 HHSEX 2.019627e-01 AGE 1.139698e+02 ... NWPCTLECAT 2.580682e+00 INCPCTLECAT 3.429553e+00 NINCPCTLECAT 3.523264e+00 INCQRTCAT 5.850373e-01 NINCQRTCAT 6.002715e-01 Length: 351, dtype: float64
df.apply(trimmed_var, limits=(0.1, 0.1)).sort_values()
HOTHMA 0.000000e+00 NOTXBND 0.000000e+00 MORTBND 0.000000e+00 GOVTBND 0.000000e+00 OBND 0.000000e+00 ... DEBT 3.089865e+09 NETWORTH 3.099929e+09 HOUSES 4.978660e+09 NFIN 8.456442e+09 ASSET 1.175370e+10 Length: 351, dtype: float64
# Calculate trimmed variance
df.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10)
WAGEINC 5.550737e+08 HOMEEQ 7.338377e+08 NH_MORT 1.333125e+09 MRTHEL 1.380468e+09 PLOAN1 1.441968e+09 DEBT 3.089865e+09 NETWORTH 3.099929e+09 HOUSES 4.978660e+09 NFIN 8.456442e+09 ASSET 1.175370e+10 dtype: float64
# Create a Series top_ten_trim_var with the 10 features with the largest variance
top_ten_trim_var = df.apply(trimmed_var, limits=(0.1, 0.1)).sort_values().tail(10)
top_ten_trim_var
WAGEINC 5.550737e+08 HOMEEQ 7.338377e+08 NH_MORT 1.333125e+09 MRTHEL 1.380468e+09 PLOAN1 1.441968e+09 DEBT 3.089865e+09 NETWORTH 3.099929e+09 HOUSES 4.978660e+09 NFIN 8.456442e+09 ASSET 1.175370e+10 dtype: float64
Now that I've got a better set of numbers, I'll make another bar graph.
Plot Trimmed Variance
# Use plotly express to create a horizontal bar chart of top_ten_trim_var
fig = px.bar(
    x=top_ten_trim_var,
    y=top_ten_trim_var.index,
    title="SCF: High Variance Features"  # Title "SCF: High Variance Features"
)
fig.update_layout(xaxis_title="Trimmed Variance", yaxis_title="Feature") # Label x-axis "Trimmed Variance" & y-axis "Feature"
fig.show()
There are three things to notice in this plot. First, the variances have decreased a lot. In my previous chart, the x-axis went up to \$80 billion; this one goes up to \$12 billion. Second, the top 10 features have changed a bit. All the features relating to business ownership ("...BUS") are gone. Finally, I can see that there are big differences in variance from feature to feature. For example, the trimmed variance for "WAGEINC" is around \$500 million, while the trimmed variance for "ASSET" is nearly \$12 billion. In other words, these features have completely different scales. This is something that I'll need to address before I can make good clusters.
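To put a rough number on that gap, one quick check (not part of the original analysis) is to compare the largest and smallest of the top-ten trimmed variances:
print(top_ten_trim_var.max() / top_ten_trim_var.min())  # roughly a 20x difference in scale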
Extract Feature Names
top_ten_trim_var
WAGEINC 5.550737e+08 HOMEEQ 7.338377e+08 NH_MORT 1.333125e+09 MRTHEL 1.380468e+09 PLOAN1 1.441968e+09 DEBT 3.089865e+09 NETWORTH 3.099929e+09 HOUSES 4.978660e+09 NFIN 8.456442e+09 ASSET 1.175370e+10 dtype: float64
top_ten_trim_var.tail(5)
DEBT 3.089865e+09 NETWORTH 3.099929e+09 HOUSES 4.978660e+09 NFIN 8.456442e+09 ASSET 1.175370e+10 dtype: float64
top_ten_trim_var.tail(5).index
Index(['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET'], dtype='object')
top_ten_trim_var.tail(5).index.to_list()
['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']
# Generate a list high_var_cols with the column names of the five features with the highest trimmed variance
high_var_cols = top_ten_trim_var.tail(5).index.to_list()
high_var_cols
['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']
Now that I've gotten my data to a place where I can use it, I can follow the steps I've used before to build a model, starting with a feature matrix.
Vertical Split
# Create the feature matrix X
X = df[high_var_cols]
print("X shape:", X.shape)
X.head() # It should contain the five columns in high_var_cols
X shape: (4418, 5)
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
5 | 12200.0 | -6710.0 | 0.0 | 3900.0 | 5490.0 |
6 | 12600.0 | -4710.0 | 0.0 | 6300.0 | 7890.0 |
7 | 15300.0 | -8115.0 | 0.0 | 5600.0 | 7185.0 |
8 | 14100.0 | -2510.0 | 0.0 | 10000.0 | 11590.0 |
9 | 15400.0 | -5715.0 | 0.0 | 8100.0 | 9685.0 |
During my EDA, I saw that I had a scale issue among my features. That issue can make it harder to cluster the data, so I'll need to fix that to help my analysis along. One strategy I can use is standardization, a statistical method for putting all the variables in a dataset on the same scale.
Standardization: Aggregating Mean and STD
X["DEBT"].mean()
72701.25848800362
X.mean()
DEBT 72701.258488 NETWORTH 76387.768900 HOUSES 74530.805794 NFIN 117330.637166 ASSET 149089.027388 dtype: float64
X.aggregate(["mean", "std"])
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
mean | 72701.258488 | 76387.768900 | 74530.805794 | 117330.637166 | 149089.027388 |
std | 135950.435529 | 220159.684405 | 154546.415791 | 239038.471726 | 288166.040553 |
X.aggregate(["mean", "std"]).astype(int)
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
mean | 72701 | 76387 | 74530 | 117330 | 149089 |
std | 135950 | 220159 | 154546 | 239038 | 288166 |
# Create a DataFrame X_summary with the mean and standard deviation for all the features in X
X_summary = X.aggregate(["mean", "std"]).astype(int)
X_summary
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
mean | 72701 | 76387 | 74530 | 117330 | 149089 |
std | 135950 | 220159 | 154546 | 239038 | 288166 |
That's the information I need to standardize my data.
Standardization: Standard Scaler
I can create a StandardScaler transformer, use it to transform the data in X, and then put the transformed data into a DataFrame named X_scaled.
x = X["DEBT"]
x.head()
5 12200.0 6 12600.0 7 15300.0 8 14100.0 9 15400.0 Name: DEBT, dtype: float64
x = X["DEBT"]
print("mean", round(x.mean()))
print("std", round(x.std()))
mean 72701 std 135950
(x - x.mean()).head()
5 -60501.258488 6 -60101.258488 7 -57401.258488 8 -58601.258488 9 -57301.258488 Name: DEBT, dtype: float64
(x - x.mean()) / x.std()
5 -0.445024 6 -0.442082 7 -0.422222 8 -0.431049 9 -0.421486 ... 28865 -0.458265 28866 -0.458265 28867 -0.458265 28868 -0.458265 28869 -0.458265 Name: DEBT, Length: 4418, dtype: float64
x_scaled = (x - x.mean()) / x.std()
x_scaled.head()
5 -0.445024 6 -0.442082 7 -0.422222 8 -0.431049 9 -0.421486 Name: DEBT, dtype: float64
x_scaled = (x - x.mean()) / x.std()
print("mean", round(x_scaled.mean()))
print("std", round(x_scaled.std()))
mean 0 std 1
# Instantiate transformer
ss = StandardScaler()
# Transform `X`
X_scaled_data = ss.fit_transform(X)
type(X_scaled_data)
numpy.ndarray
# Instantiate transformer
ss = StandardScaler()
# Transform `X`
X_scaled_data = ss.fit_transform(X)
# Put `X_scaled_data` into DataFrame
X_scaled = pd.DataFrame(X_scaled_data, columns=X.columns)
print("X_scaled shape:", X_scaled.shape)
X_scaled.head()
X_scaled shape: (4418, 5)
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
0 | -0.445075 | -0.377486 | -0.48231 | -0.474583 | -0.498377 |
1 | -0.442132 | -0.368401 | -0.48231 | -0.464541 | -0.490047 |
2 | -0.422270 | -0.383868 | -0.48231 | -0.467470 | -0.492494 |
3 | -0.431097 | -0.358407 | -0.48231 | -0.449061 | -0.477206 |
4 | -0.421534 | -0.372966 | -0.48231 | -0.457010 | -0.483818 |
All five of the features use the same scale now. But just to make sure, I'll take a look at their mean and standard deviation.
Standardization: Check Mean and STD
# Create a DataFrame X_scaled_summary with the mean and standard deviation for all the features in X_scaled
X_scaled_summary = X_scaled.aggregate(["mean", "std"]).astype(int)
X_scaled_summary
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
mean | 0 | 0 | 0 | 0 | 0 |
std | 1 | 1 | 1 | 1 | 1 |
And that's what it should look like. Standardization takes all the features and scales them so that they all have a mean of 0 and a standard deviation of 1.
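As a side note, the fitted StandardScaler stores the statistics it used, so I could also confirm the transformation directly from the transformer itself (a quick check added here, not part of the original output):
print(ss.mean_)   # per-column means the scaler subtracted; should match X.mean()
print(ss.scale_)  # per-column standard deviations it divided by (computed with ddof=0)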
Now that I can compare all my data on the same scale, I can start making clusters. The next step is to figure out how many clusters I should have.
Build and Tune Model
Here I'll use a for loop to build and train a K-Means model where n_clusters ranges from 2 to 12 (inclusive). My model should include a StandardScaler. Each time a model is trained, I'll calculate the inertia and add it to the list inertia_errors, then calculate the silhouette score and add it to the list silhouette_scores.
n_clusters = range(2, 13)
inertia_errors = []
silhouette_scores = []
# Add `for` loop to train model and calculate inertia, silhouette score.
for k in n_clusters:
    # Build model
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42))
    # Train model
    model.fit(X)
    # Calculate inertia
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    # Calculate silhouette score
    silhouette_scores.append(
        silhouette_score(X, model.named_steps["kmeans"].labels_)
    )
print("Inertia:", inertia_errors[:3])
print()
print("Silhouette Scores:", silhouette_scores[:3])
Inertia: [11028.058082607145, 7190.526303575355, 5924.997726868041] Silhouette Scores: [0.7464502937083215, 0.7044601307791996, 0.6962653079183132]
I'll now create an elbow plot to see how many clusters I should use.
Plot Inertia vs Clusters
# Use plotly express to create a line plot that shows the values of inertia_errors as a function of n_clusters
fig = px.line(
    x=n_clusters, y=inertia_errors, title="K-Means Model: Inertia vs Number of Clusters"
)
fig.update_layout(xaxis_title="Number of Clusters (k)", yaxis_title="Inertia") # Label x-axis "Number of Clusters", y-axis "Inertia", and title "K-Means Model: Inertia vs Number of Clusters"
fig.show()
I can see that the line starts to flatten out around 4 or 5 clusters.
I'll make another line plot based on the silhouette scores.
Plot Silhouette Score vs Clusters
# Use plotly express to create a line plot that shows the values of silhouette_scores as a function of n_clusters
fig = px.line(
    x=n_clusters,
    y=silhouette_scores,
    title="K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(
    xaxis_title="Number of Clusters", yaxis_title="Silhouette Score"
)  # Label x-axis "Number of Clusters", y-axis "Silhouette Score", and title "K-Means Model: Silhouette Score vs Number of Clusters"
fig.show()
This one's a little less straightforward, but I can see that the best silhouette scores occur when there are 3 or 4 clusters.
Putting the information from this plot together with my inertia plot, it seems like the best setting for n_clusters will be 4.
Build Final Model
# Build and train a new k-means model named final_model
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=4, random_state=42)
)
final_model.fit(X)
Pipeline(steps=[('standardscaler', StandardScaler()), ('kmeans', KMeans(n_clusters=4, random_state=42))])
Extract Labels
# Extract the labels that my final_model created during training and assign them to the variable labels
labels = final_model.named_steps["kmeans"].labels_
print(labels[:5])
[0 0 0 0 0]
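Before aggregating, it's also worth a quick check of how many households landed in each cluster (a small addition; this count wasn't in the original notebook):
print(pd.Series(labels).value_counts().sort_index())  # number of households per cluster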
I'm going to make a visualization, so I need to create a new DataFrame to work with.
Side-by-Side Bar Chart: Get Centroids
# Create a DataFrame xgb that contains the mean values of the features in X for each of the clusters in my final_model
xgb = X.groupby(labels).mean()
xgb
DEBT | NETWORTH | HOUSES | NFIN | ASSET | |
---|---|---|---|---|---|
0 | 26551.075439 | 13676.153182 | 13745.637777 | 2.722605e+04 | 4.022723e+04 |
1 | 218112.818182 | 174713.441558 | 257403.246753 | 3.305884e+05 | 3.928263e+05 |
2 | 116160.779817 | 965764.155963 | 264339.449541 | 7.800611e+05 | 1.081925e+06 |
3 | 732937.575758 | 760397.575758 | 826136.363636 | 1.276227e+06 | 1.493335e+06 |
Now that I have a DataFrame, I'll make a bar chart and see how my clusters differ.
Side-by-Side Bar Chart: Build Chart
# Use plotly express to create a side-by-side bar chart from xgb that shows the mean of the features in X for each of the clusters in my final_model
fig = px.bar(
    xgb,
    barmode="group",
    title="Mean Household Finances by Cluster"  # Title "Mean Household Finances by Cluster"
)
fig.update_layout(xaxis_title="Cluster", yaxis_title="Value [$]") # Label x-axis "Cluster" & the y-axis "Value [$]"
fig.show()
My clusters are based partially on NETWORTH, which means that the households in cluster 0 have the smallest net worth and the households in cluster 2 have the highest. Based on that, there are some interesting things to unpack here.
First, I'll take a look at the DEBT variable. I might think that debt would scale up as net worth increases, but it doesn't: the households in cluster 2 carry less debt than those in cluster 1, even though the value of their houses (shown in green) is roughly the same. I can't really tell from this data what's going on, but one possibility might be that the people in cluster 2 have enough money to pay down their debts, but not quite enough money to leverage what they have into additional debts. The people in cluster 3, by contrast, might not need to worry about carrying debt because their net worth is so high.
Finally, since I started out this project looking at home values, I'll take a look at the relationship between DEBT and HOUSES. The value of the debt for the people in cluster 0 is higher than the value of their houses, suggesting that most of the debt being carried by those people is tied up in their mortgages — if they own a home at all. Contrast that with the other three clusters: the value of everyone else's debt is lower than the value of their homes.
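A couple of quick checks on the claims above, straight from the centroid table (these weren't part of the original notebook):
# Rank the clusters by mean net worth
print(xgb["NETWORTH"].sort_values())

# Flag clusters where mean debt exceeds mean house value (True only for cluster 0)
print(xgb["DEBT"] > xgb["HOUSES"])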
This is different from what I did last time. At this point in the K-Means Clustering project, I made a scatter plot. That was a straightforward task because I only worked with two features, so I could plot the data points in two dimensions. But now X has five dimensions.
Since I'm working with a computer screen, I don't have much of a choice about the number of dimensions I can use: it's got to be two. So, if I'm going to do anything like the scatter plot I made before, I'll need to take my 5-dimensional data and change it into something I can look at in 2 dimensions.
Principal Component Analysis
I'll need to create a PCA transformer, use it to reduce the dimensionality of the data in X to 2, and then put the transformed data into a DataFrame named X_pca. The columns of X_pca should be named "PC1" and "PC2".
# Instantiate transformer
pca = PCA(n_components=2, random_state=42)
# Transform `X`
X_t = pca.fit_transform(X)
X_t
array([[-221525.42453041, -22052.27300297], [-217775.10072188, -22851.35806823], [-219519.64217476, -19023.64633269], ..., [ 323306.39571853, -182741.08760824], [ 327948.76711284, -184322.85705512], [ 334191.95622931, -186450.06424231]])
# Instantiate transformer
pca = PCA(n_components=2, random_state=42)
# Transform `X`
X_t = pca.fit_transform(X)
# Put `X_t` into DataFrame
X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"] )
print("X_pca shape:", X_pca.shape)
X_pca.head()
X_pca shape: (4418, 2)
PC1 | PC2 | |
---|---|---|
0 | -221525.424530 | -22052.273003 |
1 | -217775.100722 | -22851.358068 |
2 | -219519.642175 | -19023.646333 |
3 | -212195.720367 | -22957.107039 |
4 | -215540.507551 | -20259.749306 |
My five dimensions have been reduced to two. I'll make a scatter plot and see what I get.
PCA Scatter Plot
I'll use plotly express to create a scatter plot of X_pca. I'll color the data points using the labels generated by my final_model.
labels
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)
labels.astype(str)
array(['0', '0', '0', ..., '1', '1', '1'], dtype='<U11')
# Create scatter plot of `PC2` vs `PC1`
fig = px.scatter(
    data_frame=X_pca,
    x="PC1",
    y="PC2",
    color=labels,
    title="PCA Representation of Clusters"  # Title "PCA Representation of Clusters"
)
fig.update_layout(xaxis_title="PC1", yaxis_title="PC2") # Label the x-axis "PC1" & the y-axis "PC2"
fig.show()
One limitation of this plot is that it's hard to explain what the axes here represent. In fact, both of them are a combination of the five features I originally had in X, which means this is pretty abstract.
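One way to make the axes a little less abstract is to look at how much variance each component captures and how the original features load onto it. A minimal sketch using standard scikit-learn attributes (this inspection isn't part of the original analysis):
# Share of the total variance captured by each principal component
print(pca.explained_variance_ratio_)

# Loadings: how strongly each original feature contributes to PC1 and PC2
loadings = pd.DataFrame(pca.components_, columns=X.columns, index=["PC1", "PC2"])
print(loadings)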
So what does this graph mean? It means that I made four tightly-grouped clusters that share some key features.