Polish Bankruptcy Prediction 🇵🇱

Part 1: Working with JSON files

In [1]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"

In this project, I'll be tracking corporate bankruptcies in Poland. To do that, I'll need to get data that's been stored in a JSON file, explore it, and turn it into a DataFrame that I'll use to train my model.

In [29]:
import gzip
import json
import pandas as pd

GOALS

  • Decompress a file using the command line
  • Open a file and load its contents
  • Explore my dataset and load it into a DataFrame

Machine Learning Workflow

  • Prepare Data
    • Open: Open a compressed JSON file.
      • command line, context handler
    • Explore: JSON syntax.
      • key-value pair
    • Import: Import JSON data into a DataFrame.

Prepare Data

Open

The first thing I need to do is access the file that contains the data I need. I've done this using multiple strategies before, but this time around, I'm going to use the command line.

The Command Line

The file I'm using for this project is compressed, so I'll need to use a utility called gzip to decompress it.

Decompress File

In [30]:
%%bash
cd data
# -d: decompress, -k: keep the original .gz file, -f: force overwrite of any existing output
gzip -dkf poland-bankruptcy-data-2009.json.gz
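
For what it's worth, the same decompression can be done without leaving Python. A minimal sketch using only the standard library (not part of the original notebook):

# Sketch: decompress the .gz file in pure Python instead of the shell
import gzip
import shutil

with gzip.open("data/poland-bankruptcy-data-2009.json.gz", "rb") as f_in:
    with open("data/poland-bankruptcy-data-2009.json", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)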
In [31]:
# Open file and load JSON
with open("data/poland-bankruptcy-data-2009.json", "r") as read_file:
    poland_data = json.load(read_file)  # Deserialize the JSON into a dictionary named `poland_data`

print(type(poland_data))  
<class 'dict'>

Print Keys

In [32]:
# Print `poland_data` keys
poland_data.keys()
Out[32]:
dict_keys(['schema', 'data', 'metadata'])

Explore Dictionary

In [33]:
# Continue Exploring `poland_data`
poland_data["metadata"]
Out[33]:
{'title': 'Ensemble Boosted Trees with Synthetic Features Generation in Application to Bankruptcy Prediction',
 'authors': 'Zieba, M., Tomczak, S. K., & Tomczak, J. M.',
 'journal': 'Expert Systems with Applications',
 'publicationYear': 2016,
 'dataYear': 2009,
 'articleLink': 'doi:10.1016/j.eswa.2016.04.001',
 'datasetLink': 'https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data'}
In [34]:
poland_data["schema"].keys()
Out[34]:
dict_keys(['fields', 'primaryKey', 'pandas_version'])
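
The "pandas_version" entry suggests the schema follows the pandas table-schema convention, in which "fields" holds a list of column definitions. A small sketch to peek at a few of them, assuming that layout:

# Peek at the first few field definitions (assumes a list of dicts under "fields")
for field in poland_data["schema"]["fields"][:5]:
    print(field)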
In [35]:
type(poland_data["data"])
Out[35]:
list
In [36]:
poland_data["data"][0]
Out[36]:
{'company_id': 1,
 'feat_1': 0.17419,
 'feat_2': 0.41299,
 'feat_3': 0.14371,
 'feat_4': 1.348,
 'feat_5': -28.982,
 'feat_6': 0.60383,
 'feat_7': 0.21946,
 'feat_8': 1.1225,
 'feat_9': 1.1961,
 'feat_10': 0.46359,
 'feat_11': 0.21946,
 'feat_12': 0.53139,
 'feat_13': 0.14233,
 'feat_14': 0.21946,
 'feat_15': 592.24,
 'feat_16': 0.6163,
 'feat_17': 2.4213,
 'feat_18': 0.21946,
 'feat_19': 0.12272,
 'feat_20': 37.573,
 'feat_21': 0.9969,
 'feat_22': 0.2951,
 'feat_23': 0.097402,
 'feat_24': 0.75641,
 'feat_25': 0.46359,
 'feat_26': 0.50669,
 'feat_27': 1.9737,
 'feat_28': 0.32417,
 'feat_29': 5.9473,
 'feat_30': 0.22493,
 'feat_31': 0.12272,
 'feat_32': 100.82,
 'feat_33': 3.6203,
 'feat_34': 0.71453,
 'feat_35': 0.2951,
 'feat_36': 1.8079,
 'feat_37': 123140.0,
 'feat_38': 0.46359,
 'feat_39': 0.16501,
 'feat_40': 0.21282,
 'feat_41': 0.041124,
 'feat_42': 0.16501,
 'feat_43': 95.682,
 'feat_44': 58.109,
 'feat_45': 0.94621,
 'feat_46': 0.90221,
 'feat_47': 44.941,
 'feat_48': 0.26003,
 'feat_49': 0.1454,
 'feat_50': 1.348,
 'feat_51': 0.41299,
 'feat_52': 0.27622,
 'feat_53': 1.0457,
 'feat_54': 1.0458,
 'feat_55': 127280.0,
 'feat_56': 0.16396,
 'feat_57': 0.37574,
 'feat_58': 0.83604,
 'feat_59': 6.5269e-06,
 'feat_60': 9.7145,
 'feat_61': 6.2813,
 'feat_62': 84.291,
 'feat_63': 4.3303,
 'feat_64': 4.0341,
 'bankrupt': False}

This dataset includes all the information I need to figure out whether or not a Polish company went bankrupt in 2009. There are 64 features in the dataset, each of which corresponds to some element of a company's balance sheet. I can explore the features in detail by looking at the data dictionary. Most importantly, I also know whether or not each company went bankrupt; that's the last key-value pair.
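
Since the label is a boolean, a quick sketch (not part of the original notebook) shows how imbalanced the classes are:

# Sketch: fraction of companies in the dataset that went bankrupt
labels = [company["bankrupt"] for company in poland_data["data"]]
print(sum(labels) / len(labels))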

Now that I know what data I have for each company, I will take a look at how many companies there are.

Calculate Observations

In [38]:
# Calculate the number of companies included in the dataset
len(poland_data["data"])
Out[38]:
9977

Calculate Features

In [41]:
# Count the key-value pairs for the first company: 64 features plus "company_id" and "bankrupt"
len(poland_data["data"][0])
Out[41]:
66

Since I'm dealing with data stored in a JSON file, which is common for semi-structured data, I can't assume that all companies have the same features.

Check Data Integrity

First, I should iterate through the companies in poland_data["data"] and check that they all have the same number of key-value pairs.

In [42]:
# Iterate through companies, flagging any with a different number of keys
for item in poland_data["data"]:
    if len(item) != 66:
        print("ALERT!!")

It looks like they do, so now I can put all this together.
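
One caveat: counting keys only proves the records are the same length. A slightly stricter sketch (not in the original notebook) compares the key sets themselves:

# Stricter check: every record must have exactly the same keys, not just 66 of them
expected_keys = set(poland_data["data"][0].keys())
for item in poland_data["data"]:
    assert set(item.keys()) == expected_keys, "Key mismatch!"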

Load Compressed File

In [43]:
# Open compressed file and load contents
with gzip.open("data/poland-bankruptcy-data-2009.json.gz", "r") as read_file:
    poland_data_gz = json.load(read_file)

print(type(poland_data_gz))
<class 'dict'>

Since I now have two versions of the dataset — one compressed and one uncompressed — I need to compare them to make sure they're the same.
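
One quick way to do that (a sketch; the cells below take the more manual route) is a direct equality check, since Python compares dictionaries by value:

# Dictionaries compare by value, so this verifies both versions match exactly
assert poland_data == poland_data_gz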

Explore Data

In [44]:
# Explore "poland_data_gz"
print(poland_data_gz.keys())
print(len(poland_data_gz["data"]))
print(len(poland_data_gz["data"][0]))
dict_keys(['schema', 'data', 'metadata'])
9977
66

The two versions match, so I can turn the dictionary into a DataFrame using pandas.

Load Dictionary Into DataFrame

In [46]:
df = pd.DataFrame.from_dict(poland_data_gz["data"])
print(df.shape)
df.head()
(9977, 66)
Out[46]:
company_id feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 ... feat_56 feat_57 feat_58 feat_59 feat_60 feat_61 feat_62 feat_63 feat_64 bankrupt
0 1 0.174190 0.41299 0.14371 1.3480 -28.9820 0.60383 0.219460 1.12250 1.1961 ... 0.163960 0.375740 0.83604 0.000007 9.7145 6.2813 84.291 4.3303 4.0341 False
1 2 0.146240 0.46038 0.28230 1.6294 2.5952 0.00000 0.171850 1.17210 1.6018 ... 0.027516 0.271000 0.90108 0.000000 5.9882 4.1103 102.190 3.5716 5.9500 False
2 3 0.000595 0.22612 0.48839 3.1599 84.8740 0.19114 0.004572 2.98810 1.0077 ... 0.007639 0.000881 0.99236 0.000000 6.7742 3.7922 64.846 5.6287 4.4581 False
3 5 0.188290 0.41504 0.34231 1.9279 -58.2740 0.00000 0.233580 1.40940 1.3393 ... 0.176480 0.321880 0.82635 0.073039 2.5912 7.0756 100.540 3.6303 4.6375 False
4 6 0.182060 0.55615 0.32191 1.6045 16.3140 0.00000 0.182060 0.79808 1.8126 ... 0.555770 0.410190 0.46957 0.029421 8.4553 3.3488 107.240 3.4036 12.4540 False

5 rows × 66 columns

In [47]:
# Create a DataFrame `df` that contains all the companies in the dataset, indexed by "company_id"
df = pd.DataFrame.from_dict(poland_data_gz["data"]).set_index("company_id")
print(df.shape)
df.head()
(9977, 65)
Out[47]:
feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 feat_10 ... feat_56 feat_57 feat_58 feat_59 feat_60 feat_61 feat_62 feat_63 feat_64 bankrupt
company_id
1 0.174190 0.41299 0.14371 1.3480 -28.9820 0.60383 0.219460 1.12250 1.1961 0.46359 ... 0.163960 0.375740 0.83604 0.000007 9.7145 6.2813 84.291 4.3303 4.0341 False
2 0.146240 0.46038 0.28230 1.6294 2.5952 0.00000 0.171850 1.17210 1.6018 0.53962 ... 0.027516 0.271000 0.90108 0.000000 5.9882 4.1103 102.190 3.5716 5.9500 False
3 0.000595 0.22612 0.48839 3.1599 84.8740 0.19114 0.004572 2.98810 1.0077 0.67566 ... 0.007639 0.000881 0.99236 0.000000 6.7742 3.7922 64.846 5.6287 4.4581 False
5 0.188290 0.41504 0.34231 1.9279 -58.2740 0.00000 0.233580 1.40940 1.3393 0.58496 ... 0.176480 0.321880 0.82635 0.073039 2.5912 7.0756 100.540 3.6303 4.6375 False
6 0.182060 0.55615 0.32191 1.6045 16.3140 0.00000 0.182060 0.79808 1.8126 0.44385 ... 0.555770 0.410190 0.46957 0.029421 8.4553 3.3488 107.240 3.4036 12.4540 False

5 rows × 65 columns

Import

Now that I have everything set up the way I need it, I'll combine all these steps into a single function that decompresses the file, loads it into a DataFrame, and returns it as something I can use.

Write Wrangle Function

In [48]:
# Create a wrangle function that takes the name of a compressed file as input and returns a tidy DataFrame
def wrangle(filename):
    # Open compressed file, load into dict
    with gzip.open(filename, "r") as f:
        data = json.load(f)
        
    # Turn dict into DataFrame
    df = pd.DataFrame.from_dict(data["data"]).set_index("company_id")
    
    return df
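
As a quick sanity check, calling wrangle on the compressed file from earlier should reproduce the DataFrame built step by step above:

df = wrangle("data/poland-bankruptcy-data-2009.json.gz")
print(df.shape)  # expect (9977, 65), matching the shape from In [47]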