Polish Bankruptcy Prediction 🇵🇱

Part 3: Random Forest

In [8]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"

So far in the Working with JSON files and Imbalanced Data projects, I've learned how to retrieve and decompress data, and how to manage imbalanced data to build a decision-tree model.

In this project, I'm going to expand my decision tree model into an entire forest (an example of an ensemble model); use a grid search to tune hyperparameters; and create a function that loads data and a pre-trained model, then uses that model to generate a Series of predictions.

In [9]:
import gzip
import json
import pickle
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline

GOALS

  • Improve my bankruptcy model using a random forest.
  • Evaluate model generalizability using cross-validation.
  • Tune my model hyperparameters using grid search.
  • Build a prediction function that uses my saved model.

Machine Learning Workflow

  • Prepare Data
    • Import
    • Explore
    • Split
      • Resample: Over-sampling
  • Build Model
    • Baseline
    • Iterate
      • Random forest, cross validation, grid search
    • Evaluate
      • Confusion matrix
  • Communicate Results
    • Feature importances
    • Save model as file
    • Build predictions function

Prepare Data

As always, I'll begin by importing the dataset.

I'll write the wrangle function using the code I developed in the Working with JSON files project. Then I'll use it to import poland-bankruptcy-data-2009.json.gz into the DataFrame df.

In [10]:
def wrangle(filename):
    
    # Open compressed file, load into dictionary
    with gzip.open(filename, "r") as f:
        data = json.load(f)

    # Load dictionary into DataFrame, set index
    df = pd.DataFrame.from_dict(data["data"]).set_index("company_id")
    
    return df
In [11]:
df = wrangle("data/poland-bankruptcy-data-2009.json.gz")
print(df.shape)
df.head()
(9977, 65)
Out[11]:
feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 feat_10 ... feat_56 feat_57 feat_58 feat_59 feat_60 feat_61 feat_62 feat_63 feat_64 bankrupt
company_id
1 0.174190 0.41299 0.14371 1.3480 -28.9820 0.60383 0.219460 1.12250 1.1961 0.46359 ... 0.163960 0.375740 0.83604 0.000007 9.7145 6.2813 84.291 4.3303 4.0341 False
2 0.146240 0.46038 0.28230 1.6294 2.5952 0.00000 0.171850 1.17210 1.6018 0.53962 ... 0.027516 0.271000 0.90108 0.000000 5.9882 4.1103 102.190 3.5716 5.9500 False
3 0.000595 0.22612 0.48839 3.1599 84.8740 0.19114 0.004572 2.98810 1.0077 0.67566 ... 0.007639 0.000881 0.99236 0.000000 6.7742 3.7922 64.846 5.6287 4.4581 False
5 0.188290 0.41504 0.34231 1.9279 -58.2740 0.00000 0.233580 1.40940 1.3393 0.58496 ... 0.176480 0.321880 0.82635 0.073039 2.5912 7.0756 100.540 3.6303 4.6375 False
6 0.182060 0.55615 0.32191 1.6045 16.3140 0.00000 0.182060 0.79808 1.8126 0.44385 ... 0.555770 0.410190 0.46957 0.029421 8.4553 3.3488 107.240 3.4036 12.4540 False

5 rows × 65 columns

Split

In [12]:
# Create my feature matrix X and target vector y
target = "bankrupt"            # My target is "bankrupt"
feature = "feat_27"
X = df.drop(columns="bankrupt")
y = df[target]

print("X shape:", X.shape)
print("y shape:", y.shape)
X shape: (9977, 64)
y shape: (9977,)

Since I'm not working with time series data, I'm going to randomly divide my dataset into training and test sets.

In [13]:
# Divide my data (X and y) into training and test sets using a randomized train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42             #  My test set should be 20% of my total data & set a random_state for reproducibility
)  

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_train shape: (7981, 64)
y_train shape: (7981,)
X_test shape: (1996, 64)
y_test shape: (1996,)

Here, I didn't create a validation set, even though I'm planning on tuning my model's hyperparameters in this project. That's because I'm going to use cross-validation, which I'll talk about more later on.
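
To make that concrete, here's a minimal sketch of what a 5-fold split of the training data looks like. (Note: with a classifier, cross_val_score below actually defaults to stratified folds; plain KFold is shown here purely for illustration.)

from sklearn.model_selection import KFold

# Each iteration holds out a different fifth of the training data as a
# temporary validation set and fits on the remaining four fifths.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Fold {fold}: {len(train_idx)} train rows, {len(val_idx)} validation rows")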

Resample

Over-Sampling

In [14]:
# Create a new feature matrix X_train_over and target vector y_train_over by performing random over-sampling on the training data
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
print("X_train_over shape:", X_train_over.shape)
X_train_over.head()
X_train_over shape: (15194, 64)
Out[14]:
feat_1 feat_2 feat_3 feat_4 feat_5 feat_6 feat_7 feat_8 feat_9 feat_10 ... feat_55 feat_56 feat_57 feat_58 feat_59 feat_60 feat_61 feat_62 feat_63 feat_64
0 0.279320 0.053105 0.852030 17.0440 199.080 0.741770 0.353570 16.00600 1.2346 0.84997 ... 52857.00 0.190040 0.328630 0.80996 0.00000 NaN 4.1858 11.002 33.1760 18.5720
1 0.001871 0.735120 0.156460 1.2269 -10.837 0.000000 0.002938 0.36032 1.4809 0.26488 ... 440.02 0.014794 0.007064 0.99803 0.00000 7.4268 2.2925 169.960 2.1476 9.6185
2 0.113940 0.490250 0.077121 1.2332 -43.184 -0.000171 0.113940 1.03980 1.1649 0.50975 ... 4617.40 0.214890 0.223520 0.78761 0.27412 6.2791 6.1622 103.630 3.5220 1.9673
3 0.008136 0.652610 0.148120 1.2628 29.071 0.000000 0.008136 0.53230 1.2891 0.34739 ... 920.98 0.045169 0.023421 0.99434 0.14403 22.7480 2.2673 159.580 2.2872 4.4718
4 0.045396 0.279640 0.708730 3.7656 238.120 0.000000 0.056710 2.57610 1.0169 0.72036 ... 10744.00 0.047501 0.063019 0.94624 0.00000 13.8860 49.0660 91.984 3.9681 29.0460

5 rows × 64 columns
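
It's worth confirming what the over-sampler actually did: RandomOverSampler duplicates randomly chosen minority-class ("bankrupt") rows until both classes have the same number of observations. A quick sanity check (given the shapes above, the counts should come out to 7,597 each):

# The over-sampled target should now be perfectly balanced
print(y_train_over.value_counts())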

Build Model

Now that I have my data set up the right way, I can build the model. 🏗

Baseline

In [15]:
# Calculate the baseline accuracy score for my model
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 4))
Baseline Accuracy: 0.9519
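
This baseline is just the relative frequency of the majority class ("not bankrupt"). As a cross-check, the same number can be reproduced with scikit-learn's DummyClassifier, which always predicts the most frequent class (a quick sketch; DummyClassifier ignores the feature values, so the NaNs in X_train don't matter here):

from sklearn.dummy import DummyClassifier

# A model that always predicts "not bankrupt" should score
# exactly the baseline accuracy on the training data
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
print("Dummy accuracy:", round(dummy.score(X_train, y_train), 4))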

Iterate

So far, I've built single models that predict a single outcome. That's definitely a useful way to predict the future, but what if the one model I built isn't the right one? If I could somehow use more than one model simultaneously, I'd have a more trustworthy prediction.

Ensemble models work by building multiple models on random subsets of the same data and then aggregating their predictions to make a final prediction. Since I used a decision tree in the Imbalanced Data project, I'm going to create an ensemble of trees here. This type of model is called a random forest.
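
To make "ensemble of trees" concrete, here's a toy sketch of the bagging idea behind a random forest: fit several decision trees, each on a bootstrap sample (rows drawn with replacement) of the training data, then take a majority vote. This is only an illustration, not what RandomForestClassifier does in every detail; a real random forest also randomizes which features each split considers, and the fillna(0) below is a crude stand-in for the pipeline's SimpleImputer.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(42)
trees = []
for _ in range(10):
    # Bootstrap sample: draw rows with replacement from the training data
    idx = rng.integers(0, len(X_train_over), size=len(X_train_over))
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train_over.iloc[idx].fillna(0), y_train_over.iloc[idx])
    trees.append(tree)

# Majority vote across the ten trees
votes = np.mean([t.predict(X_test.fillna(0)) for t in trees], axis=0)
ensemble_pred = votes > 0.5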

I'll start by creating a pipeline to streamline my workflow.

Random Forest

In [16]:
# Create a pipeline named clf (short for "classifier") that contains a SimpleImputer transformer and a RandomForestClassifier predictor
clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42))
print(clf)
Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])

By default, the number of trees in my forest (n_estimators) is set to 100. That means when I train this classifier, I'll be fitting 100 trees. While it will take longer to train, it will hopefully lead to better performance.
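
Because the forest lives inside a pipeline, its hyperparameters are addressed as "<step name>__<parameter name>". This double-underscore convention is what the parameter grid below relies on; a quick sketch:

# Pipeline hyperparameters use the step name plus a double underscore
print(clf.get_params()["randomforestclassifier__n_estimators"])  # 100 by default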

In order to get the best performance from my model, I need to tune its hyperparameters. But how can I do this if I haven't created a validation set? The answer is cross-validation. So, before I look at hyperparameters, I'll see how cross-validation works with the classifier I just built.

Cross-Validation

I'll perform cross-validation with my classifier, using the over-sampled training data. I want five folds, so I'll set cv to 5. I also want to speed up training, so I'll set n_jobs to -1.

In [17]:
cv_acc_scores = cross_val_score(clf, X_train_over, y_train_over, cv=5, n_jobs=-1)
print(cv_acc_scores)
[0.99670944 0.99835472 0.99769661 0.9970385  0.99901251]

That took kind of a long time, because I just trained 500 decision trees: 5 folds, each fitting a forest of 100 trees.
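
The five scores are all close together, which is the point of cross-validation: it gives a sense of the variability of performance, not just a single number. A quick summary (a sketch):

# Summarize the five fold scores as mean ± standard deviation
print(f"CV accuracy: {cv_acc_scores.mean():.4f} ± {cv_acc_scores.std():.4f}")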

Now that I have an idea of how cross-validation works, I'll tune my model. The first step is creating a range of hyperparameters that I want to evaluate.

Parameter Grid

I'll create a dictionary with the range of hyperparameters that I want to evaluate for my classifier.

  1. For the SimpleImputer, I'll try both the "mean" and "median" strategies.
  2. For the RandomForestClassifier, I'll try max_depth settings from 10 to 40, in steps of 10.
  3. Also for the RandomForestClassifier, I'll try n_estimators settings from 25 to 75, in steps of 25.
In [18]:
params = {
    "simpleimputer__strategy": ["mean", "median"],
    "randomforestclassifier__n_estimators": range(25, 100, 25),
    "randomforestclassifier__max_depth": range(10, 50, 10)
}
params
Out[18]:
{'simpleimputer__strategy': ['mean', 'median'],
 'randomforestclassifier__n_estimators': range(25, 100, 25),
 'randomforestclassifier__max_depth': range(10, 50, 10)}
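
Before running the search, I can count how many combinations this grid implies. ParameterGrid expands the dictionary the same way GridSearchCV will (a quick sketch):

from sklearn.model_selection import ParameterGrid

# Each element of ParameterGrid is one hyperparameter combination
n_candidates = len(ParameterGrid(params))
print("Candidates:", n_candidates)          # 2 * 3 * 4 = 24
print("Fits with cv=5:", n_candidates * 5)  # 120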

Now that I have my hyperparameter grid, I'll incorporate it into a grid search.

Grid Search

I'll create a GridSearchCV named model that includes my classifier and hyperparameter grid. I'll use the same arguments for cv and n_jobs that I used above, and set verbose to 1.

In [19]:
model = GridSearchCV(
    clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)
model
Out[19]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('simpleimputer', SimpleImputer()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': range(10, 50, 10),
                         'randomforestclassifier__n_estimators': range(25, 100, 25),
                         'simpleimputer__strategy': ['mean', 'median']},
             verbose=1)

Now I'll fit the model.

Fit Model

I'll fit model to the over-sampled training data.

In [20]:
# Train model
model.fit(X_train_over, y_train_over)
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Out[20]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('simpleimputer', SimpleImputer()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': range(10, 50, 10),
                         'randomforestclassifier__n_estimators': range(25, 100, 25),
                         'simpleimputer__strategy': ['mean', 'median']},
             verbose=1)

This took some time to train, so I'll take a moment to think about why.

How many forests did I just test? 4 different max_depth settings times 3 n_estimators settings times 2 imputation strategies... that makes 24 forests. How many fits did I just do? 24 forests times 5 folds is 120. And remember that each forest is composed of 25-75 trees, so that works out to at least 3,000 trees. So it's computationally expensive!

Okay, now that I've tested all those models, I'll take a look at the results.

Grid Search CV Results

I'll extract the cross-validation results from model and load them into a DataFrame named cv_results. First, I'll peek at the raw cv_results_ attribute to see what I'm working with.

In [22]:
type(model.cv_results_)
Out[22]:
dict
In [23]:
model.cv_results_
Out[23]:
{'mean_fit_time': array([1.00355539, 1.07836399, 1.94340334, 1.98407006, 2.77629285,
        2.90537648, 1.07852516, 1.19905667, 2.20930009, 2.41117682,
        3.42673826, 3.29229503, 1.08620858, 1.15216389, 2.10699034,
        2.23801274, 3.15159402, 3.28632994, 1.06176701, 1.17954669,
        2.14178104, 2.41711006, 3.24803786, 2.77838759]),
 'std_fit_time': array([0.03856819, 0.03540615, 0.05185897, 0.04574253, 0.07909457,
        0.06405301, 0.02560595, 0.0390085 , 0.01599794, 0.11240178,
        0.0487337 , 0.08662619, 0.02143532, 0.02709878, 0.06299303,
        0.04703319, 0.0371392 , 0.04001555, 0.00673546, 0.01877013,
        0.07555698, 0.04551205, 0.10208889, 0.31279351]),
 'mean_score_time': array([0.01466966, 0.01207418, 0.02495403, 0.02191372, 0.02949762,
        0.03006196, 0.0140924 , 0.01335678, 0.02198453, 0.02545037,
        0.03338242, 0.02829528, 0.01157928, 0.01366849, 0.02265434,
        0.02950273, 0.0344842 , 0.04188023, 0.01282077, 0.01302428,
        0.02575898, 0.02508702, 0.03186536, 0.02331223]),
 'std_score_time': array([0.0086213 , 0.00171805, 0.00737565, 0.00655867, 0.00390039,
        0.0061574 , 0.00209357, 0.00412691, 0.00286714, 0.00659942,
        0.00355047, 0.00348966, 0.00185703, 0.00211356, 0.00374876,
        0.00604901, 0.00385735, 0.00359888, 0.00208611, 0.0017833 ,
        0.00440169, 0.00480983, 0.00379528, 0.00049813]),
 'param_randomforestclassifier__max_depth': masked_array(data=[10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 30, 30,
                    30, 30, 30, 30, 40, 40, 40, 40, 40, 40],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_randomforestclassifier__n_estimators': masked_array(data=[25, 25, 50, 50, 75, 75, 25, 25, 50, 50, 75, 75, 25, 25,
                    50, 50, 75, 75, 25, 25, 50, 50, 75, 75],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_simpleimputer__strategy': masked_array(data=['mean', 'median', 'mean', 'median', 'mean', 'median',
                    'mean', 'median', 'mean', 'median', 'mean', 'median',
                    'mean', 'median', 'mean', 'median', 'mean', 'median',
                    'mean', 'median', 'mean', 'median', 'mean', 'median'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 10,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 20,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 30,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 25,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 50,
   'simpleimputer__strategy': 'median'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'mean'},
  {'randomforestclassifier__max_depth': 40,
   'randomforestclassifier__n_estimators': 75,
   'simpleimputer__strategy': 'median'}],
 'split0_test_score': array([0.9792695 , 0.97861139, 0.98321816, 0.98025666, 0.98420533,
        0.98091477, 0.99638039, 0.99572228, 0.99670944, 0.99605133,
        0.99670944, 0.99572228, 0.99638039, 0.99638039, 0.99638039,
        0.99670944, 0.99670944, 0.99670944, 0.99638039, 0.99638039,
        0.99638039, 0.99670944, 0.99670944, 0.99670944]),
 'split1_test_score': array([0.97729516, 0.96906877, 0.97959855, 0.97104311, 0.98058572,
        0.97268838, 0.9970385 , 0.99670944, 0.99670944, 0.99736756,
        0.99736756, 0.99736756, 0.9970385 , 0.99670944, 0.99769661,
        0.99736756, 0.99769661, 0.99769661, 0.99670944, 0.99769661,
        0.99835472, 0.99835472, 0.99835472, 0.99835472]),
 'split2_test_score': array([0.97795327, 0.97564988, 0.97828233, 0.97894044, 0.97696611,
        0.98157289, 0.99572228, 0.99736756, 0.99572228, 0.99802567,
        0.99605133, 0.99802567, 0.99736756, 0.99901283, 0.99736756,
        0.99868378, 0.99769661, 0.99901283, 0.99736756, 0.99868378,
        0.9970385 , 0.99868378, 0.99769661, 0.99835472]),
 'split3_test_score': array([0.98091477, 0.97005594, 0.98453439, 0.97828233, 0.98453439,
        0.97992761, 0.99539322, 0.99638039, 0.99506417, 0.99670944,
        0.99638039, 0.9970385 , 0.99506417, 0.99638039, 0.9970385 ,
        0.9970385 , 0.9970385 , 0.9970385 , 0.99506417, 0.99670944,
        0.9970385 , 0.99670944, 0.99736756, 0.9970385 ]),
 'split4_test_score': array([0.98222515, 0.97498354, 0.98387097, 0.97761685, 0.9845293 ,
        0.98156682, 0.99736669, 0.99802502, 0.99736669, 0.99835418,
        0.99802502, 0.99835418, 0.99802502, 0.99901251, 0.99835418,
        0.99934167, 0.99868334, 0.99901251, 0.99802502, 0.99901251,
        0.99835418, 0.99901251, 0.99868334, 0.99901251]),
 'mean_test_score': array([0.97953157, 0.9736739 , 0.98190088, 0.97722788, 0.98216417,
        0.97933409, 0.99638021, 0.99684094, 0.9963144 , 0.99730164,
        0.99690675, 0.99730164, 0.99677513, 0.99749911, 0.99736745,
        0.99782819, 0.9975649 , 0.99789398, 0.99670931, 0.99769655,
        0.99743326, 0.99789398, 0.99776234, 0.99789398]),
 'std_test_score': array([0.0018292 , 0.00358595, 0.00248783, 0.00321313, 0.00299631,
        0.00337679, 0.00075013, 0.00079501, 0.00081647, 0.00084266,
        0.0007086 , 0.00091652, 0.00100656, 0.00124164, 0.00065795,
        0.00101094, 0.00067742, 0.00096715, 0.0009979 , 0.00104048,
        0.00078961, 0.00098928, 0.00070256, 0.000873  ]),
 'rank_test_score': array([21, 24, 20, 23, 19, 22, 17, 14, 18, 11, 13, 11, 15,  8, 10,  4,  7,
         2, 16,  6,  9,  1,  5,  2], dtype=int32)}
In [24]:
cv_results = pd.DataFrame(model.cv_results_)
cv_results.head(10)
Out[24]:
mean_fit_time std_fit_time mean_score_time std_score_time param_randomforestclassifier__max_depth param_randomforestclassifier__n_estimators param_simpleimputer__strategy params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 1.003555 0.038568 0.014670 0.008621 10 25 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.979269 0.977295 0.977953 0.980915 0.982225 0.979532 0.001829 21
1 1.078364 0.035406 0.012074 0.001718 10 25 median {'randomforestclassifier__max_depth': 10, 'ran... 0.978611 0.969069 0.975650 0.970056 0.974984 0.973674 0.003586 24
2 1.943403 0.051859 0.024954 0.007376 10 50 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.983218 0.979599 0.978282 0.984534 0.983871 0.981901 0.002488 20
3 1.984070 0.045743 0.021914 0.006559 10 50 median {'randomforestclassifier__max_depth': 10, 'ran... 0.980257 0.971043 0.978940 0.978282 0.977617 0.977228 0.003213 23
4 2.776293 0.079095 0.029498 0.003900 10 75 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.984205 0.980586 0.976966 0.984534 0.984529 0.982164 0.002996 19
5 2.905376 0.064053 0.030062 0.006157 10 75 median {'randomforestclassifier__max_depth': 10, 'ran... 0.980915 0.972688 0.981573 0.979928 0.981567 0.979334 0.003377 22
6 1.078525 0.025606 0.014092 0.002094 20 25 mean {'randomforestclassifier__max_depth': 20, 'ran... 0.996380 0.997038 0.995722 0.995393 0.997367 0.996380 0.000750 17
7 1.199057 0.039009 0.013357 0.004127 20 25 median {'randomforestclassifier__max_depth': 20, 'ran... 0.995722 0.996709 0.997368 0.996380 0.998025 0.996841 0.000795 14
8 2.209300 0.015998 0.021985 0.002867 20 50 mean {'randomforestclassifier__max_depth': 20, 'ran... 0.996709 0.996709 0.995722 0.995064 0.997367 0.996314 0.000816 18
9 2.411177 0.112402 0.025450 0.006599 20 50 median {'randomforestclassifier__max_depth': 20, 'ran... 0.996051 0.997368 0.998026 0.996709 0.998354 0.997302 0.000843 11

In addition to the accuracy scores for all the different models I tried during my grid search, I can see how long it took each model to train. I'll take a closer look at how different hyperparameter settings affect training time.
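
One convenient view (a sketch, using the column names shown above) is to sort by rank_test_score so the best-performing combinations come first:

# Best-ranked combinations first, with their accuracy and training time
cv_results.sort_values("rank_test_score")[
    ["params", "mean_test_score", "mean_fit_time"]
].head()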

First, I'll look at n_estimators. My grid search evaluated this hyperparameter for various max_depth settings, but I'll only look at models where max_depth equals 10.

Training Time vs N Estimators

I'll create a mask for cv_results for rows where "param_randomforestclassifier__max_depth" equals 10. Then I'll plot "param_randomforestclassifier__n_estimators" on the x-axis and "mean_fit_time" on the y-axis.

In [26]:
cv_results["param_randomforestclassifier__max_depth"] == 10
Out[26]:
0      True
1      True
2      True
3      True
4      True
5      True
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
Name: param_randomforestclassifier__max_depth, dtype: bool
In [27]:
mask = cv_results["param_randomforestclassifier__max_depth"] == 10
cv_results[mask]
Out[27]:
mean_fit_time std_fit_time mean_score_time std_score_time param_randomforestclassifier__max_depth param_randomforestclassifier__n_estimators param_simpleimputer__strategy params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 1.003555 0.038568 0.014670 0.008621 10 25 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.979269 0.977295 0.977953 0.980915 0.982225 0.979532 0.001829 21
1 1.078364 0.035406 0.012074 0.001718 10 25 median {'randomforestclassifier__max_depth': 10, 'ran... 0.978611 0.969069 0.975650 0.970056 0.974984 0.973674 0.003586 24
2 1.943403 0.051859 0.024954 0.007376 10 50 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.983218 0.979599 0.978282 0.984534 0.983871 0.981901 0.002488 20
3 1.984070 0.045743 0.021914 0.006559 10 50 median {'randomforestclassifier__max_depth': 10, 'ran... 0.980257 0.971043 0.978940 0.978282 0.977617 0.977228 0.003213 23
4 2.776293 0.079095 0.029498 0.003900 10 75 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.984205 0.980586 0.976966 0.984534 0.984529 0.982164 0.002996 19
5 2.905376 0.064053 0.030062 0.006157 10 75 median {'randomforestclassifier__max_depth': 10, 'ran... 0.980915 0.972688 0.981573 0.979928 0.981567 0.979334 0.003377 22
In [28]:
# Create mask
mask = cv_results["param_randomforestclassifier__max_depth"] == 10
# Plot fit time vs n_estimators
plt.plot(
    cv_results[mask]["param_randomforestclassifier__n_estimators"],
    cv_results[mask]["mean_fit_time"]
)
# Label axes
plt.xlabel("Number of Estimators")
plt.ylabel("Mean Fit Time [seconds]")
plt.title("Training Time vs Estimators (max_depth=10)");

Next, I'll look at max_depth. Here, I'll also limit my data to rows where n_estimators equals 25.

Training Time vs Max Depth

I'll create a mask for cv_results for rows where "param_randomforestclassifier__n_estimators" equals 25. Then I'll plot "param_randomforestclassifier__max_depth" on the x-axis and "mean_fit_time" on the y-axis.

In [29]:
# Create mask
mask = cv_results["param_randomforestclassifier__n_estimators"] == 25

# Plot fit time vs max_depth
plt.plot(
    cv_results[mask]["param_randomforestclassifier__max_depth"],
    cv_results[mask]["mean_fit_time"]
)
# Label axes
plt.xlabel("Max Depth")
plt.ylabel("Mean Fit Time [seconds]")
plt.title("Training Time vs Max Depth (n_estimators=25)");
In [30]:
cv_results[mask]
Out[30]:
mean_fit_time std_fit_time mean_score_time std_score_time param_randomforestclassifier__max_depth param_randomforestclassifier__n_estimators param_simpleimputer__strategy params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 1.003555 0.038568 0.014670 0.008621 10 25 mean {'randomforestclassifier__max_depth': 10, 'ran... 0.979269 0.977295 0.977953 0.980915 0.982225 0.979532 0.001829 21
1 1.078364 0.035406 0.012074 0.001718 10 25 median {'randomforestclassifier__max_depth': 10, 'ran... 0.978611 0.969069 0.975650 0.970056 0.974984 0.973674 0.003586 24
6 1.078525 0.025606 0.014092 0.002094 20 25 mean {'randomforestclassifier__max_depth': 20, 'ran... 0.996380 0.997038 0.995722 0.995393 0.997367 0.996380 0.000750 17
7 1.199057 0.039009 0.013357 0.004127 20 25 median {'randomforestclassifier__max_depth': 20, 'ran... 0.995722 0.996709 0.997368 0.996380 0.998025 0.996841 0.000795 14
12 1.086209 0.021435 0.011579 0.001857 30 25 mean {'randomforestclassifier__max_depth': 30, 'ran... 0.996380 0.997038 0.997368 0.995064 0.998025 0.996775 0.001007 15
13 1.152164 0.027099 0.013668 0.002114 30 25 median {'randomforestclassifier__max_depth': 30, 'ran... 0.996380 0.996709 0.999013 0.996380 0.999013 0.997499 0.001242 8
18 1.061767 0.006735 0.012821 0.002086 40 25 mean {'randomforestclassifier__max_depth': 40, 'ran... 0.996380 0.996709 0.997368 0.995064 0.998025 0.996709 0.000998 16
19 1.179547 0.018770 0.013024 0.001783 40 25 median {'randomforestclassifier__max_depth': 40, 'ran... 0.996380 0.997697 0.998684 0.996709 0.999013 0.997697 0.001040 6
In [31]:
cv_results[mask][["mean_fit_time", "param_randomforestclassifier__max_depth", "param_simpleimputer__strategy"]]
Out[31]:
mean_fit_time param_randomforestclassifier__max_depth param_simpleimputer__strategy
0 1.003555 10 mean
1 1.078364 10 median
6 1.078525 20 mean
7 1.199057 20 median
12 1.086209 30 mean
13 1.152164 30 median
18 1.061767 40 mean
19 1.179547 40 median

There's a general upward trend, but I see a lot of up-and-down here. That's because, for each max depth, the grid search tries two different imputation strategies: mean and median. The mean is faster to compute than the median (which requires sorting the values), which is why the mean-strategy models in the table above train slightly faster.
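
That difference shows up directly in the numbers. Grouping the masked rows by imputation strategy makes the gap explicit (a sketch):

# Average fit time by imputation strategy (n_estimators=25 rows only)
cv_results[mask].groupby("param_simpleimputer__strategy")["mean_fit_time"].mean()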

Finally, I'll look at the hyperparameters that led to the best performance.

Extract Best Hyperparameters

I'll extract the best hyperparameters from model.

In [32]:
# Extract best hyperparameters
model.best_params_
Out[32]:
{'randomforestclassifier__max_depth': 40,
 'randomforestclassifier__n_estimators': 50,
 'simpleimputer__strategy': 'median'}
In [33]:
model.best_score_
Out[33]:
0.9978939791055105
In [34]:
model.best_estimator_
Out[34]:
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=40, n_estimators=50,
                                        random_state=42))])
In [35]:
model.predict(X_train_over)
Out[35]:
array([False, False, False, ...,  True,  True,  True])

Note that I don't need to build and train a new model with these settings. Now that the grid search is complete, when I use model.predict(), it will serve up predictions using the best model.
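
This works because GridSearchCV refits the best pipeline on the full training data once the search finishes (refit=True by default) and stores it as best_estimator_; model.predict() simply delegates to it. A quick sketch to confirm the two agree:

import numpy as np

# GridSearchCV.predict delegates to the refitted best pipeline
assert np.array_equal(model.predict(X_test), model.best_estimator_.predict(X_test))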

Evaluate

All right: The moment of truth. I'll see how my model performs.

In [36]:
# Calculate the training and test accuracy scores for model
acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)

print("Training Accuracy:", round(acc_train, 4))
print("Test Accuracy:", round(acc_test, 4))
Training Accuracy: 1.0
Test Accuracy: 0.9589

I beat the baseline! Just barely, but I beat it.

Next, I'm going to use a confusion matrix to see how my model performs. To better understand the values I'll see in the matrix, I'll first count how many observations in my test set belong to the positive and negative classes.

In [37]:
y_test.value_counts()
Out[37]:
False    1913
True       83
Name: bankrupt, dtype: int64

Confusion Matrix

I'll plot a confusion matrix that shows how my best model performs on my test set.

In [38]:
# Plot confusion matrix
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

Notice the relationship between the numbers in this matrix and the value counts from the previous task. If I sum the values in the bottom row, I get the total number of positive observations in y_test ($72 + 11 = 83$), and the values in the top row sum to the number of negative observations ($1902 + 11 = 1913$).
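
The same numbers can be pulled out programmatically with confusion_matrix, whose rows are true classes and columns are predicted classes (a sketch; the counts are those shown in the plot):

from sklearn.metrics import confusion_matrix

# Rows are true classes (False, True); columns are predicted classes
cm = confusion_matrix(y_test, model.predict(X_test))
print(cm)
print("Actual class counts (row sums):", cm.sum(axis=1))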

Communicate

Extract Feature Importances

I'll create a horizontal bar chart with the 10 most important features for my model.

In [39]:
features = X_train_over.columns
features
Out[39]:
Index(['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7',
       'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_12', 'feat_13',
       'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19',
       'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_24', 'feat_25',
       'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'feat_31',
       'feat_32', 'feat_33', 'feat_34', 'feat_35', 'feat_36', 'feat_37',
       'feat_38', 'feat_39', 'feat_40', 'feat_41', 'feat_42', 'feat_43',
       'feat_44', 'feat_45', 'feat_46', 'feat_47', 'feat_48', 'feat_49',
       'feat_50', 'feat_51', 'feat_52', 'feat_53', 'feat_54', 'feat_55',
       'feat_56', 'feat_57', 'feat_58', 'feat_59', 'feat_60', 'feat_61',
       'feat_62', 'feat_63', 'feat_64'],
      dtype='object')
In [40]:
importances = model.best_estimator_.named_steps[
    "randomforestclassifier"
].feature_importances_
importances
Out[40]:
array([0.0101393 , 0.01287852, 0.01147172, 0.00905564, 0.02058596,
       0.02059257, 0.01113304, 0.01431152, 0.01338085, 0.0104422 ,
       0.00933864, 0.01591063, 0.0350984 , 0.00983862, 0.01471214,
       0.02722643, 0.00894526, 0.01105401, 0.01444032, 0.01044288,
       0.01948826, 0.01151105, 0.0140539 , 0.05165966, 0.01758254,
       0.02942304, 0.07060223, 0.0111041 , 0.01661098, 0.01036807,
       0.0133371 , 0.00643053, 0.00744856, 0.02801246, 0.02282155,
       0.00994131, 0.01103002, 0.02258117, 0.02095925, 0.01482569,
       0.01687734, 0.01275205, 0.00923591, 0.01261843, 0.01882168,
       0.03347121, 0.01110962, 0.01235876, 0.01284221, 0.00787817,
       0.01095399, 0.00680687, 0.00824845, 0.01027538, 0.01405089,
       0.01526372, 0.00959164, 0.02134601, 0.00752884, 0.01034815,
       0.00942581, 0.00831442, 0.00730218, 0.01178816])
In [41]:
feat_imp = pd.Series(importances, index=features)
feat_imp
Out[41]:
feat_1     0.010139
feat_2     0.012879
feat_3     0.011472
feat_4     0.009056
feat_5     0.020586
             ...   
feat_60    0.010348
feat_61    0.009426
feat_62    0.008314
feat_63    0.007302
feat_64    0.011788
Length: 64, dtype: float64
In [42]:
feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp
Out[42]:
feat_32    0.006431
feat_52    0.006807
feat_63    0.007302
feat_33    0.007449
feat_59    0.007529
             ...   
feat_26    0.029423
feat_46    0.033471
feat_13    0.035098
feat_24    0.051660
feat_27    0.070602
Length: 64, dtype: float64
In [43]:
# Get feature names from training data
features = X_train_over.columns

# Extract importances from model
importances = model.best_estimator_.named_steps[
    "randomforestclassifier"
].feature_importances_

# Create a series with feature names and importances
feat_imp = pd.Series(importances, index=features).sort_values()

# Plot 10 most important features
feat_imp.tail(10).plot(kind="barh")
plt.xlabel("Gini Importance")
plt.ylabel("Feature")
plt.title("Feature Importance");

The only thing left now is to save my model so that it can be reused.

Save Model

Using a context manager, I'll save my best-performing model to a file named "model-5-3.pkl".

In [44]:
# Save model
with open("model-5-3.pkl", "wb") as f:
    pickle.dump(model, f)
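
As a quick round-trip check (a sketch), the pickled file can be reloaded and should produce predictions identical to the in-memory model:

# Reload the pickled model and confirm it predicts the same labels
with open("model-5-3.pkl", "rb") as f:
    loaded_model = pickle.load(f)
print((loaded_model.predict(X_test) == model.predict(X_test)).all())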

"Make Predictions" Function

I'll create a function make_predictions. It should take two arguments: the path of a JSON file that contains test data and the path of a serialized model. The function should load and clean the data using the wrangle function I created, load the model, generate an array of predictions, and convert that array into a Series. (The Series should have the name "bankrupt" and the same index labels as the test data.) Finally, the function should return its predictions as a Series.

In [45]:
def make_predictions(data_filepath, model_filepath):
    # Wrangle JSON file
    X_test = wrangle(data_filepath)
    # Load model
    with open(model_filepath, "rb") as f:
        model = pickle.load(f)
    # Generate predictions
    y_test_pred = model.predict(X_test)
    # Put predictions into Series with name "bankrupt", and same index as X_test
    y_test_pred = pd.Series(y_test_pred, index=X_test.index, name="bankrupt")
    return y_test_pred
In [47]:
# Check my make_predictions function
y_test_pred = make_predictions(
    data_filepath="data/poland-bankruptcy-data-2009-mvp-features.json.gz",
    model_filepath="model-5-3.pkl",
)

print("predictions shape:", y_test_pred.shape)
y_test_pred.head()
predictions shape: (526,)
Out[47]:
company_id
4     False
32    False
34    False
36    False
40    False
Name: bankrupt, dtype: bool