Survey of Consumer Finances: Unsupervised Clustering
Part 1: Exploring the Data
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In this project, I'm going to work with data from the Survey of Consumer Finances (SCF). The SCF is a survey sponsored by the US Federal Reserve. It tracks financial, demographic, and opinion information about families in the United States. The survey is conducted every three years, and I'll work with an extract of the results from 2019.
GOALS
Subset my data to focus on "credit fearful" households.
Explore demographic information like age, race, education, income, and assets.
Create side-by-side bar charts to highlight differences between "credit fearful" and non-credit fearful households.
Prepare Data
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
First, I need to load the data, which is stored in a compressed CSV file: SCFP2019.csv.gz. In the last project, I decompressed files using gzip and the command line. However, the pandas read_csv function can work with compressed files directly.
Import Data
# Read the file "data/SCFP2019.csv.gz" into the DataFrame df
df = pd.read_csv("data/SCFP2019.csv.gz")
print("df shape:", df.shape)
df.head()
df shape: (28885, 351)
YY1 | Y1 | WGT | HHSEX | AGE | AGECL | EDUC | EDCL | MARRIED | KIDS | ... | NWCAT | INCCAT | ASSETCAT | NINCCAT | NINC2CAT | NWPCTLECAT | INCPCTLECAT | NINCPCTLECAT | INCQRTCAT | NINCQRTCAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 11 | 6119.779308 | 2 | 75 | 6 | 12 | 4 | 2 | 0 | ... | 5 | 3 | 6 | 3 | 2 | 10 | 6 | 6 | 3 | 3 |
1 | 1 | 12 | 4712.374912 | 2 | 75 | 6 | 12 | 4 | 2 | 0 | ... | 5 | 3 | 6 | 3 | 1 | 10 | 5 | 5 | 2 | 2 |
2 | 1 | 13 | 5145.224455 | 2 | 75 | 6 | 12 | 4 | 2 | 0 | ... | 5 | 3 | 6 | 3 | 1 | 10 | 5 | 5 | 2 | 2 |
3 | 1 | 14 | 5297.663412 | 2 | 75 | 6 | 12 | 4 | 2 | 0 | ... | 5 | 2 | 6 | 2 | 1 | 10 | 4 | 4 | 2 | 2 |
4 | 1 | 15 | 4761.812371 | 2 | 75 | 6 | 12 | 4 | 2 | 0 | ... | 5 | 3 | 6 | 3 | 1 | 10 | 5 | 5 | 2 | 2 |
5 rows × 351 columns
One of the first things I notice here is that this dataset is huge: nearly 29,000 rows and 351 columns! I won't have time to explore all of the features in this dataset, but I can refer to the official Code Book for their definitions. For now, I'll just say that this dataset tracks all sorts of behaviors relating to the ways households earn, save, and spend money in the United States.
For this project, I'm going to focus on households that have "been turned down for credit or feared being denied credit in the past 5 years." These households are identified in the "TURNFEAR" column.
Subset Data
# Use a mask to subset df to only households that have been turned down or feared being turned down for credit ("TURNFEAR" == 1)
mask = df["TURNFEAR"] == 1
mask.head(10)
0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
Name: TURNFEAR, dtype: bool
mask.sum()
4623
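That's 4,623 households. As a quick sanity check (my own addition, not part of the original flow), the mean of a Boolean mask gives the share of flagged rows:
# Sketch: True counts as 1, so the mean is the fraction of credit-fearful households
print("Share credit fearful:", round(mask.mean(), 3))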
mask = df["TURNFEAR"] == 1
df_fear = df[mask] # Assign the subset to the variable name df_fear
print("df_fear shape:", df_fear.shape)
df_fear.head()
df_fear shape: (4623, 351)
YY1 | Y1 | WGT | HHSEX | AGE | AGECL | EDUC | EDCL | MARRIED | KIDS | ... | NWCAT | INCCAT | ASSETCAT | NINCCAT | NINC2CAT | NWPCTLECAT | INCPCTLECAT | NINCPCTLECAT | INCQRTCAT | NINCQRTCAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 2 | 21 | 3790.476607 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
6 | 2 | 22 | 3798.868505 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 3 | 2 | 2 |
7 | 2 | 23 | 3799.468393 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
8 | 2 | 24 | 3788.076005 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
9 | 2 | 25 | 3793.066589 | 1 | 50 | 3 | 8 | 2 | 1 | 3 | ... | 1 | 2 | 1 | 2 | 1 | 1 | 4 | 4 | 2 | 2 |
5 rows × 351 columns
Now that I have my subset, I'll explore the characteristics of this group. One of the features is age group ("AGECL").
Age Groups
df_fear["AGECL"].head()
5    3
6    3
7    3
8    3
9    3
Name: AGECL, dtype: int64
df_fear["AGECL"].nunique()
6
# Create a list age_groups with the unique values in the "AGECL" column
age_groups = df_fear["AGECL"].unique()
print("Age Groups:", age_groups)
Age Groups: [3 5 1 2 4 6]
Looking at the Code Book, I can see that "AGECL" represents categorical data, even though the values in the column are numeric. This simplifies data storage, but it's not very human-readable. So before I create a visualization, I'll create a version of this column that uses the actual group names.
Replace Age Groups
df_fear["AGECL"].head(10)
5      3
6      3
7      3
8      3
9      3
110    5
111    5
112    5
113    5
114    5
Name: AGECL, dtype: int64
# Create a Series age_cl that contains the observations from "AGECL" using the true group names
agecl_dict = {
1: "Under 35",
2: "35-44",
3: "45-54",
4: "55-64",
5: "65-74",
6: "75 or Older",
}
age_cl = df_fear["AGECL"].replace(agecl_dict)
age_cl.head(10)
5      45-54
6      45-54
7      45-54
8      45-54
9      45-54
110    65-74
111    65-74
112    65-74
113    65-74
114    65-74
Name: AGECL, dtype: object
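As an aside, Series.map would do the same lookup here; the difference (a sketch of my own) is in how unmapped codes are handled:
# Sketch: map does the same lookup, but any code missing from agecl_dict
# would become NaN under map, while replace leaves unmapped values unchanged
print(df_fear["AGECL"].map(agecl_dict).head())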
Now that I have better labels, I'll make a bar chart and see the age distribution of my group.
Age Group Bar Chart
age_cl.head()
5    45-54
6    45-54
7    45-54
8    45-54
9    45-54
Name: AGECL, dtype: object
age_cl.value_counts()
Under 35       1307
35-44          1093
45-54           932
55-64           745
65-74           401
75 or Older     145
Name: AGECL, dtype: int64
# Create a bar chart showing the value counts from age_cl
age_cl_value_counts = age_cl.value_counts()
# Bar plot of `age_cl_value_counts`
age_cl_value_counts.plot(
kind="bar",
xlabel="Age Group", # Label the x-axis "Age Group"
ylabel="Frequency (count)", # Label the y-axis "Frequency (count)"
title="Credit Fearful: Age Groups" # Title "Credit Fearful: Age Groups".
);
I noticed that by creating their own age groups, the authors of the survey have essentially made a histogram for us, composed of six bins. My chart is telling me that many of the people who fear being denied credit are younger. But the first bin ("Under 35") covers a wider range than the middle bins, so it might be useful to look inside those values to get a more granular understanding of the data.
To do that, I'll need to look at a different variable: "AGE". Whereas "AGECL" was a categorical variable, "AGE" is continuous, so I can use it to make a histogram of my own.
Age Histogram
# Create a histogram of the "AGE" column with 10 bins
df_fear["AGE"].hist(bins=10)
plt.xlabel("Age") # Label the x-axis "Age"
plt.ylabel("Frequency (count)") # Label the y-axis "Frequency (count)"
plt.title("Credit Fearful: Age Distribution"); # Title "Credit Fearful: Age Distribution"
It looks like younger people are still more concerned about being able to secure a loan than older people, but the people who are most concerned seem to be between 30 and 40.
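To double-check that reading, here's a quick sketch of my own that bins ages by decade (the bin edges are my assumption):
# Sketch: count credit-fearful respondents per decade of age
decades = pd.cut(df_fear["AGE"], bins=range(10, 101, 10), right=False)
print(decades.value_counts().sort_index())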
Now that I have an understanding of how age relates to my outcome of interest, I'll try some other possibilities, starting with race. If I look at the Code Book for "RACE", I can see that there are four categories.
Note that there's no 4 category here. If a value of 4 did exist, it would be reasonable to assign it to "Asian American / Pacific Islander", a group that doesn't seem to be separately represented in the dataset. This dataset doesn't provide a complete picture of race in America.
Race Bar Chart: Credit Fearful
race_dict = {
1: "White/Non-Hispanic",
2: "Black/African-American",
3: "Hispanic",
5: "Other",
}
race = df_fear["RACE"].replace(race_dict)
race.head(10)
5      White/Non-Hispanic
6      White/Non-Hispanic
7      White/Non-Hispanic
8      White/Non-Hispanic
9      White/Non-Hispanic
110    White/Non-Hispanic
111    White/Non-Hispanic
112    White/Non-Hispanic
113    White/Non-Hispanic
114    White/Non-Hispanic
Name: RACE, dtype: object
race_dict = {
1: "White/Non-Hispanic",
2: "Black/African-American",
3: "Hispanic",
5: "Other",
}
race = df_fear["RACE"].replace(race_dict)
race_value_counts = race.value_counts(normalize=True)
race_value_counts
White/Non-Hispanic        0.539477
Black/African-American    0.237508
Hispanic                  0.162232
Other                     0.060783
Name: RACE, dtype: float64
# Create a horizontal bar chart showing the normalized value counts for "RACE"
race_dict = {
1: "White/Non-Hispanic",
2: "Black/African-American",
3: "Hispanic",
5: "Other",
}
race = df_fear["RACE"].replace(race_dict) # Replace the numerical values with the true group names
race_value_counts = race.value_counts(normalize=True)
# Create bar chart of race_value_counts
race_value_counts.plot(kind="barh")
plt.xlim((0, 1))
plt.xlabel("Frequency (%)") # Label the x-axis "Frequency (%)"
plt.ylabel("Race") # Label the y-axis "Race"
plt.title("Credit Fearful: Racial Groups"); # Title "Credit Fearful: Racial Groups"
This suggests that White/Non-Hispanic people worry more about being denied credit. But thinking critically about what I'm seeing, that might simply be because there are more White/Non-Hispanic people in the United States than there are people in other racial groups, and the sample for this survey was specifically drawn to be representative of the population as a whole.
Race Bar Chart: Whole Dataset
# Recreate the horizontal bar chart I just made using the entire dataset df instead of the subset df_fear
race = df["RACE"].replace(race_dict)
race_value_counts = race.value_counts(normalize=True)
# Create bar chart of race_value_counts
race_value_counts.plot(kind="barh")
plt.xlim((0, 1))
plt.xlabel("Frequency (%)")
plt.ylabel("Race")
plt.title("SCF Respondents: Racial Groups"); # Title of this plot should be "SCF Respondents: Racial Groups"
How does this second bar chart change my reading of the first one? On the one hand, White/Non-Hispanic respondents account for around 70% of the whole dataset, but only 54% of credit-fearful respondents. On the other hand, Black and Hispanic respondents represent 23% of the whole dataset but 40% of credit-fearful respondents. In other words, Black and Hispanic households are actually more likely to be in the credit-fearful group.
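To put exact numbers on that comparison, here's a small sketch of my own that reuses race_dict to line the two groups up side by side:
# Sketch: racial shares in the whole dataset vs. the credit-fearful subset
race_shares = pd.DataFrame({
    "whole": df["RACE"].replace(race_dict).value_counts(normalize=True),
    "fearful": df_fear["RACE"].replace(race_dict).value_counts(normalize=True),
})
print(race_shares)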
What about income level? Are people with lower incomes concerned about being denied credit, or is that something people with more money worry about? In order to answer that question, I'll need to again compare the entire dataset with my subgroup using the "INCCAT" feature, which captures income percentile groups. This time, though, I'll make a single, side-by-side bar chart.
Income Categories: Credit Fearful vs Credit Fearless
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
)
df_inccat.head()
0    40-59.9
1    40-59.9
2    40-59.9
3    21-39.9
4    40-59.9
Name: INCCAT, dtype: object
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
.groupby(df["TURNFEAR"])
)
df_inccat
<pandas.core.groupby.generic.SeriesGroupBy object at 0x144994d60>
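That repr just means pandas is deferring the work: the groups are defined, but nothing is computed until I ask for an aggregation. A quick sketch of my own to force it:
# Sketch: .size() triggers the computation and shows the two TURNFEAR groups
print(df["INCCAT"].groupby(df["TURNFEAR"]).size())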
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
)
df_inccat
TURNFEAR  INCCAT 
0         90-100     0.297296
          60-79.9    0.174841
          40-59.9    0.143146
          0-20       0.140343
          21-39.9    0.135933
          80-89.9    0.108441
1         0-20       0.288125
          21-39.9    0.256327
          40-59.9    0.228856
          60-79.9    0.132598
          90-100     0.048886
          80-89.9    0.045209
Name: INCCAT, dtype: float64
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
)
df_inccat
TURNFEAR  INCCAT 
0         90-100     0.297296
          60-79.9    0.174841
          40-59.9    0.143146
          0-20       0.140343
          21-39.9    0.135933
          80-89.9    0.108441
1         0-20       0.288125
          21-39.9    0.256327
          40-59.9    0.228856
          60-79.9    0.132598
          90-100     0.048886
          80-89.9    0.045209
Name: frequency, dtype: float64
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
.to_frame()
)
df_inccat
frequency | ||
---|---|---|
TURNFEAR | INCCAT | |
0 | 90-100 | 0.297296 |
60-79.9 | 0.174841 | |
40-59.9 | 0.143146 | |
0-20 | 0.140343 | |
21-39.9 | 0.135933 | |
80-89.9 | 0.108441 | |
1 | 0-20 | 0.288125 |
21-39.9 | 0.256327 | |
40-59.9 | 0.228856 | |
60-79.9 | 0.132598 | |
90-100 | 0.048886 | |
80-89.9 | 0.045209 |
# Create a DataFrame df_inccat that shows the normalized frequency for income categories for both the credit fearful and non-credit fearful households in the dataset
inccat_dict = {
1: "0-20",
2: "21-39.9",
3: "40-59.9",
4: "60-79.9",
5: "80-89.9",
6: "90-100",
}
df_inccat = (
df["INCCAT"]
.replace(inccat_dict)
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
.to_frame()
.reset_index()
)
df_inccat
TURNFEAR | INCCAT | frequency | |
---|---|---|---|
0 | 0 | 90-100 | 0.297296 |
1 | 0 | 60-79.9 | 0.174841 |
2 | 0 | 40-59.9 | 0.143146 |
3 | 0 | 0-20 | 0.140343 |
4 | 0 | 21-39.9 | 0.135933 |
5 | 0 | 80-89.9 | 0.108441 |
6 | 1 | 0-20 | 0.288125 |
7 | 1 | 21-39.9 | 0.256327 |
8 | 1 | 40-59.9 | 0.228856 |
9 | 1 | 60-79.9 | 0.132598 |
10 | 1 | 90-100 | 0.048886 |
11 | 1 | 80-89.9 | 0.045209 |
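As an aside, a pd.crosstab one-liner (my own sketch, not part of the original flow) yields the same normalized frequencies in wide form, which can be handy for eyeballing:
# Sketch: normalize="index" makes each TURNFEAR row sum to 1, matching the groupby result
wide = pd.crosstab(df["TURNFEAR"], df["INCCAT"].replace(inccat_dict), normalize="index")
print(wide)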
Income Categories: Side-by-Side Bar Chart
# Create bar chart of `df_inccat`
sns.barplot(
x="INCCAT", # Income categories are in the correct order along the x-axis
y="frequency",
hue="TURNFEAR", # Set hue to "TURNFEAR"
data=df_inccat,
order=inccat_dict.values()
)
plt.xlabel("Income Category") # Label x-axis "Income Category"
plt.ylabel("Frequency (%)") # Label y-axis "Frequency (%)"
plt.title("Income Distribution: Credit Fearful vs. Non-fearful"); # Title "Income Distribution: Credit Fearful vs. Non-fearful"
Comparing the income categories across the fearful and non-fearful groups, I can see that credit fearful households are much more common in the lower income categories. In other words, the credit fearful have lower incomes.
So, based on all this, what do I know? Among the households that had been turned down for credit or feared being denied in the past five years, the young and the low-income made up the largest shares of respondents. That makes sense: young people tend to make less money and rely more heavily on credit to get their lives off the ground, so having been denied credit (or expecting to be) makes them more anxious about the future.
Not all the data is demographic, though. If I were working for a bank, I would probably care less about how old people are, and more about their ability to carry more debt. If I were going to build a model for that, I'd want to establish some relationships among the variables, and making some correlation matrices is a good place to start.
First, I'll zoom out a little bit. I've been looking only at the people who answered "yes" when the survey asked about "TURNFEAR", but what if I looked at everyone instead? To begin, I'll go back to the full dataset and run a single correlation.
Assets vs Home Value: Whole Dataset
# Calculate the correlation coefficient for "ASSET" and "HOUSES" in the whole dataset df
asset_house_corr = df["ASSET"].corr(df["HOUSES"])
print("SCF: Asset Houses Correlation:", asset_house_corr)
SCF: Asset Houses Correlation: 0.5198273544779252
That's a moderate positive correlation, which I would probably expect. For many Americans, the value of their primary residence makes up most of the value of their total assets. What about the people in our TURNFEAR subset, though? I'll run that correlation to see if there's a difference.
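For reference, .corr defaults to Pearson's r. Here's a minimal by-hand sketch (assuming these columns have no missing values) that should match the result above:
# Sketch: Pearson's r computed from its definition
import numpy as np
x, y = df["ASSET"], df["HOUSES"]
num = ((x - x.mean()) * (y - y.mean())).sum()
den = np.sqrt(((x - x.mean()) ** 2).sum() * ((y - y.mean()) ** 2).sum())
print("By hand:", num / den)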
Assets vs Home Value: Credit Fearful
# Calculate the correlation coefficient for "ASSET" and "HOUSES" in the credit-fearful subset df_fear
asset_house_corr = df_fear["ASSET"].corr(df_fear["HOUSES"])
print("Credit Fearful: Asset Houses Correlation:", asset_house_corr)
Credit Fearful: Asset Houses Correlation: 0.5832879735979152
They're different. It's still only a moderate positive correlation, but the relationship between the total value of assets and the value of the primary residence is stronger for my TURNFEAR group than it is for the population as a whole.
I'll make correlation matrices using the rest of the data for both df and df_fear and see if the differences persist. Here, I'll look at only five features: "ASSET", "HOUSES", "INCOME", "DEBT", and "EDUC".
Correlation Matrix: Whole Dataset
cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
corr = df[cols]
corr.head()
ASSET | HOUSES | INCOME | DEBT | EDUC | |
---|---|---|---|---|---|
0 | 2153600.0 | 1100000.0 | 67195.781504 | 0.0 | 12 |
1 | 2116200.0 | 1100000.0 | 57014.602488 | 0.0 | 12 |
2 | 2145000.0 | 1100000.0 | 51924.012980 | 0.0 | 12 |
3 | 2552500.0 | 1100000.0 | 41742.833964 | 0.0 | 12 |
4 | 2176200.0 | 1100000.0 | 50905.895078 | 0.0 | 12 |
cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
corr = df[cols].corr()
corr
ASSET | HOUSES | INCOME | DEBT | EDUC | |
---|---|---|---|---|---|
ASSET | 1.000000 | 0.519827 | 0.622429 | 0.261250 | 0.116673 |
HOUSES | 0.519827 | 1.000000 | 0.247852 | 0.266661 | 0.169300 |
INCOME | 0.622429 | 0.247852 | 1.000000 | 0.114646 | 0.069400 |
DEBT | 0.261250 | 0.266661 | 0.114646 | 1.000000 | 0.054179 |
EDUC | 0.116673 | 0.169300 | 0.069400 | 0.054179 | 1.000000 |
# Make a correlation matrix using df, considering only the columns "ASSET", "HOUSES", "INCOME", "DEBT", and "EDUC"
cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
corr = df[cols].corr()
corr.style.background_gradient(axis=None)
ASSET | HOUSES | INCOME | DEBT | EDUC | |
---|---|---|---|---|---|
ASSET | 1.000000 | 0.519827 | 0.622429 | 0.261250 | 0.116673 |
HOUSES | 0.519827 | 1.000000 | 0.247852 | 0.266661 | 0.169300 |
INCOME | 0.622429 | 0.247852 | 1.000000 | 0.114646 | 0.069400 |
DEBT | 0.261250 | 0.266661 | 0.114646 | 1.000000 | 0.054179 |
EDUC | 0.116673 | 0.169300 | 0.069400 | 0.054179 | 1.000000 |
Correlation Matrix: Credit Fearful
# Make a correlation matrix using df_fear
corr = df_fear[cols].corr()
corr.style.background_gradient(axis=None)
ASSET | HOUSES | INCOME | DEBT | EDUC | |
---|---|---|---|---|---|
ASSET | 1.000000 | 0.583288 | 0.722074 | 0.474658 | 0.113536 |
HOUSES | 0.583288 | 1.000000 | 0.264099 | 0.962629 | 0.160348 |
INCOME | 0.722074 | 0.264099 | 1.000000 | 0.172393 | 0.133170 |
DEBT | 0.474658 | 0.962629 | 0.172393 | 1.000000 | 0.177386 |
EDUC | 0.113536 | 0.160348 | 0.133170 | 0.177386 | 1.000000 |
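Before digging into the numbers, here's an alternative rendering (a sketch of my own): the same matrix as an annotated seaborn heatmap.
# Sketch: annotated heatmap of the credit-fearful correlation matrix
sns.heatmap(df_fear[cols].corr(), annot=True, fmt=".2f", cmap="viridis")
plt.title("Credit Fearful: Correlation Matrix");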
There are some pretty important differences here. The relationship between "DEBT" and "HOUSES" is positive for both datasets, but while the coefficient for df is fairly weak at 0.26, the same number for df_fear is 0.96.
The closer a correlation coefficient is to 1.0, the more exactly the two variables move together. In this case, that means the value of the primary residence and the total debt held by the household are getting pretty close to being the same. This suggests that the main source of debt carried by my "TURNFEAR" folks is their primary residence, which, again, is an intuitive finding.
"DEBT"
and "ASSET"
share a similarly striking difference, as do "EDUC"
and "DEBT"
which, while not as extreme a contrast as the other, is still big enough to catch the interest of our hypothetical banker.
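One compact way to see all of these gaps at once (my own sketch) is to subtract one matrix from the other:
# Sketch: element-wise difference between the two correlation matrices;
# large values mark the relationships that differ most between the groups
corr_diff = df_fear[cols].corr() - df[cols].corr()
corr_diff.style.background_gradient(axis=None)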
I'll make some visualizations to show these relationships graphically.
First, I'll start with education levels ("EDUC"), comparing the credit-fearful and non-credit-fearful groups.
Education: Credit Fearful vs Credit Fearless
df_educ = (
df["EDUC"]
)
df_educ.head()
0    12
1    12
2    12
3    12
4    12
Name: EDUC, dtype: int64
df_educ = (
df["EDUC"]
.groupby(df["TURNFEAR"])
)
df_educ
<pandas.core.groupby.generic.SeriesGroupBy object at 0x144d1aa00>
df_educ = (
df["EDUC"]
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
)
df_educ.head()
TURNFEAR  EDUC
0         12      0.257481
          8       0.192029
          13      0.149823
          9       0.129833
          14      0.096117
Name: EDUC, dtype: float64
df_educ = (
df["EDUC"]
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
)
df_educ.head()
TURNFEAR  EDUC
0         12      0.257481
          8       0.192029
          13      0.149823
          9       0.129833
          14      0.096117
Name: frequency, dtype: float64
df_educ = (
df["EDUC"]
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
.to_frame()
)
df_educ.head()
frequency | ||
---|---|---|
TURNFEAR | EDUC | |
0 | 12 | 0.257481 |
8 | 0.192029 | |
13 | 0.149823 | |
9 | 0.129833 | |
14 | 0.096117 |
# Create a DataFrame df_educ that shows the normalized frequency for education categories for both the credit fearful and non-credit fearful households in the dataset
df_educ = (
df["EDUC"]
.groupby(df["TURNFEAR"])
.value_counts(normalize=True)
.rename("frequency")
.to_frame()
.reset_index()
)
df_educ.head()
TURNFEAR | EDUC | frequency | |
---|---|---|---|
0 | 0 | 12 | 0.257481 |
1 | 0 | 8 | 0.192029 |
2 | 0 | 13 | 0.149823 |
3 | 0 | 9 | 0.129833 |
4 | 0 | 14 | 0.096117 |
Education: Side-by-Side Bar Chart
# Create bar chart of `df_educ`
sns.barplot(
x="EDUC", # Make sure that the education categories are in the correct order along the x-axis
y="frequency",
hue="TURNFEAR", # Set hue to "TURNFEAR"
data=df_educ
)
plt.xlabel("Education Level") # Label the x-axis "Education Level"
plt.ylabel("Frequency (%)") # Label the y-axis "Frequency (%)"
plt.title("Educational Attainment: Credit Fearful vs. Non-fearful"); # Title "Educational Attainment: Credit Fearful vs. Non-fearful"
In this plot, I can see that a much higher proportion of credit-fearful respondents have only a high school diploma, while university degrees are more common among the non-credit fearful.
I'll keep going with some scatter plots that look at debt.
Assets vs Debt: Whole Dataset
# Use df to make a scatter plot showing the relationship between DEBT and ASSET
df.plot.scatter(x="DEBT", y="ASSET");
Assets vs Debt: Credit Fearful
# Use df_fear to make a scatter plot showing the relationship between DEBT and ASSET
df_fear.plot.scatter(x="DEBT", y="ASSET");
I can see that the relationship in my df_fear graph is flatter than the one in my df graph; the two are clearly different.
I'll end with the most striking difference from my matrices, and make some scatter plots showing the relationship between HOUSES and DEBT.
Home Value vs Debt: Whole Dataset
# Use df to make a scatter plot showing the relationship between HOUSES and DEBT
df.plot.scatter(x="DEBT", y="HOUSES");
And I'll make the same scatter plot using df_fear.
Home Value vs Debt: Credit Fearful
# Use df_fear to make a scatter plot showing the relationship between HOUSES and DEBT
df_fear.plot.scatter(x="DEBT", y="HOUSES");
The outliers make it a little difficult to see the difference between these two plots, but the relationship is clear enough: my df_fear graph shows an almost perfect linear relationship, while my df graph shows something a little more muddled. I also noticed that the data points on the df_fear graph form several little groups. Those are called "clusters," and I'll be talking more about how to analyze clustered data in the next project.
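As a closing aside, a symmetric-log scale (my own sketch, not part of the original analysis) spreads out those clusters despite the outliers and the zeros in both columns:
# Sketch: symlog axes handle zero values while compressing the extreme outliers
ax = df_fear.plot.scatter(x="DEBT", y="HOUSES", alpha=0.3)
ax.set_xscale("symlog")
ax.set_yscale("symlog")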