__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import sqlite3
import warnings
import matplotlib.pyplot as plt
import pandas as pd

from category_encoders import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)


def wrangle(db_path):
    # Connect to database
    conn = sqlite3.connect(db_path)

    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Read query results into DataFrame
    df = pd.read_sql(query, conn, index_col="b_id")

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Add high-cardinality / redundant column
    drop_cols.append("building_id")

    # Create binary target column
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Drop old target
    drop_cols.append("damage_grade")

    # Drop multicollinearity column
    drop_cols.append("count_floors_pre_eq")

    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df


# Use the wrangle function above to import my data set into the DataFrame df
df = wrangle("/home/jovyan/nepal.sqlite")  # Set path to the SQLite database as "/home/jovyan/nepal.sqlite"
df.head()


# Create my feature matrix X and target vector y
target = "severe_damage"     # My target is "severe_damage"
X = df.drop(columns=target)
y = df[target]


# Divide my data (X and y) into training and test sets using a randomized train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42              # My test set should be 20% of my total data & set a random_state for reproducibility
)


# Divide my training data (X_train and y_train) into training and validation sets using a randomized train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42  # My validation data should be 20% of the remaining data
)


# Calculate the baseline accuracy score for my model
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 2))

Baseline Accuracy: 0.64


# Build Model
model = make_pipeline(
    OrdinalEncoder(), DecisionTreeClassifier(random_state=42)   # Set a random_state for my predictor
)
# Fit model to training data
model.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'superstructure'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'dat...
E-shape                             7
H-shape                             8
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                                         {'col': 'superstructure',
                                          'data_type': dtype('O'),
                                          'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(random_state=42))])

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'superstructure'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'dat...
E-shape                             7
H-shape                             8
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                                         {'col': 'superstructure',
                                          'data_type': dtype('O'),
                                          'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(random_state=42))])

OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type',
                     'ground_floor_type', 'other_floor_type', 'position',
                     'plan_configuration', 'superstructure'],
               mapping=[{'col': 'land_surface_condition',
                         'data_type': dtype('O'),
                         'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                        {'col': 'foundation_type', 'data_type': dtype('O'),
                         'mapping': Mud mo...
                         'mapping': Rectangular                         1
T-shape                             2
Square                              3
L-shape                             4
Multi-projected                     5
U-shape                             6
E-shape                             7
H-shape                             8
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                        {'col': 'superstructure', 'data_type': dtype('O'),
                         'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])

DecisionTreeClassifier(random_state=42)


# Calculate the training and validation accuracy scores for my models
acc_train = accuracy_score(y_train, model.predict(X_train))
acc_val = model.score(X_val, y_val)

print("Training Accuracy:", round(acc_train, 2))
print("Validation Accuracy:", round(acc_val, 2))

Training Accuracy: 0.98
Validation Accuracy: 0.65


# Use the get_depth method on the DecisionTreeClassifier in my model to see how deep my tree grew during training
tree_depth = model.named_steps["decisiontreeclassifier"].get_depth()
print("Tree Depth:", tree_depth)

Tree Depth: 49


# Create a range of possible values for max_depth hyperparameter of my model's DecisionTreeClassifier
depth_hyperparams = range(1, 50, 2) # depth_hyperparams should range from 1 to 50 by steps of 2


# Create empty lists for training and validation accuracy scores
training_acc = []
validation_acc = []

for d in depth_hyperparams:
    # Create model with `max_depth` of `d`
    test_model = make_pipeline(
        OrdinalEncoder(), 
        DecisionTreeClassifier(max_depth=d, random_state=42)
    )   
    # Fit model to training data
    test_model.fit(X_train, y_train)
    # Calculate training accuracy score and append to `training_acc`
    training_acc.append(test_model.score(X_train, y_train))
    # Calculate validation accuracy score and append to `training_acc`
    validation_acc.append(test_model.score(X_val, y_val))

print("Training Accuracy Scores:", training_acc[:3])
print("Validation Accuracy Scores:", validation_acc[:3])

Training Accuracy Scores: [0.7071072484228174, 0.7117395332421582, 0.7162394670666608]
Validation Accuracy Scores: [0.7088406564319746, 0.7132521616375508, 0.7166049055937886]


# Plot `depth_hyperparams`, `training_acc`
plt.plot(depth_hyperparams, training_acc, label="training")
plt.plot(depth_hyperparams, validation_acc, label="validation")
plt.xlabel("Max Depth")      # Label x-axis as "Max Depth"
plt.ylabel("Accuracy Score") # Label y-axis as "Accuracy Score"
plt.legend();                # Insert a legend so that the two lines can be distinguished


# Build Model
model = make_pipeline(
    OrdinalEncoder(), 
    DecisionTreeClassifier(max_depth=6, random_state=42)
)

# Fit model to training data
model.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'superstructure'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'dat...
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                                         {'col': 'superstructure',
                                          'data_type': dtype('O'),
                                          'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=6, random_state=42))])

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['land_surface_condition',
                                      'foundation_type', 'roof_type',
                                      'ground_floor_type', 'other_floor_type',
                                      'position', 'plan_configuration',
                                      'superstructure'],
                                mapping=[{'col': 'land_surface_condition',
                                          'data_type': dtype('O'),
                                          'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                                         {'col': 'foundation_type',
                                          'dat...
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                                         {'col': 'superstructure',
                                          'data_type': dtype('O'),
                                          'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=6, random_state=42))])

OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type',
                     'ground_floor_type', 'other_floor_type', 'position',
                     'plan_configuration', 'superstructure'],
               mapping=[{'col': 'land_surface_condition',
                         'data_type': dtype('O'),
                         'mapping': Flat              1
Moderate slope    2
Steep slope       3
NaN              -2
dtype: int64},
                        {'col': 'foundation_type', 'data_type': dtype('O'),
                         'mapping': Mud mo...
                         'mapping': Rectangular                         1
T-shape                             2
Square                              3
L-shape                             4
Multi-projected                     5
U-shape                             6
E-shape                             7
H-shape                             8
Others                              9
Building with Central Courtyard    10
NaN                                -2
dtype: int64},
                        {'col': 'superstructure', 'data_type': dtype('O'),
                         'mapping': Stone, mud mortar        1
Stone                    2
RC, engineered           3
Brick, cement mortar     4
Adobe/mud                5
Timber                   6
RC, non-engineered       7
Brick, mud mortar        8
Stone, cement mortar     9
Bamboo                  10
Other                   11
NaN                     -2
dtype: int64}])

DecisionTreeClassifier(max_depth=6, random_state=42)


acc_train = accuracy_score(y_train, model.predict(X_train))
acc_val = model.score(X_val, y_val)

print("Training Accuracy:", round(acc_train, 2))
print("Validation Accuracy:", round(acc_val, 2))

Training Accuracy: 0.72
Validation Accuracy: 0.72


test_acc = model.score(X_test, y_test)
print("Test Accuracy:", round(test_acc, 2))

Test Accuracy: 0.72


# Create larger figure
fig, ax = plt.subplots(figsize=(25, 12))
# Plot tree
plot_tree(
    decision_tree=model.named_steps["decisiontreeclassifier"],
    feature_names=X_train.columns,
    filled=True,  # Color leaf with class
    rounded=True,  # Round leaf edges
    proportion=True,  # Display proportion of classes in leaf
    max_depth=3,  # Only display first 3 levels
    fontsize=12,  # Enlarge font
    ax=ax,  # Place in figure axis
);


# Assign the feature names and importances of my model to the variables
features = X_train.columns   # For the features, I can get them from the column names in my training set
importances = model.named_steps["decisiontreeclassifier"].feature_importances_  # For the importances, I access the feature_importances_ attribute of my model's DecisionTreeClassifier

print("Features:", features[:3])
print("Importances:", importances[:3])

Features: Index(['age_building', 'plinth_area_sq_ft', 'height_ft_pre_eq'], dtype='object')
Importances: [0.03515085 0.04618639 0.08839161]


# Create a pandas Series named feat_imp, where the index is features and the values are my importances
feat_imp = pd.Series(importances, index=features).sort_values()  # The Series should be sorted from smallest to largest importance
feat_imp.head()

position                  0.000644
plan_configuration        0.004847
foundation_type           0.005206
roof_type                 0.007620
land_surface_condition    0.020759
dtype: float64


feat_imp.sum()

0.9999999999999999


# Create horizontal bar chart
feat_imp.plot(kind="barh")
plt.xlabel("Gini Importance")  # Label x-axis "Gini Importance"
plt.ylabel("Feature");

Prepare Data¶

Import¶

Split¶

Build Model¶

Baseline¶

Iterate¶

Evaluate¶

Communicate¶

	age_building	plinth_area_sq_ft	height_ft_pre_eq	land_surface_condition	foundation_type	roof_type	ground_floor_type	other_floor_type	position	plan_configuration	superstructure	severe_damage
b_id
164002	20	560	18	Flat	Mud mortar-Stone/Brick	Bamboo/Timber-Light roof	Mud	TImber/Bamboo-Mud	Not attached	Rectangular	Stone, mud mortar	0
164081	21	200	12	Flat	Mud mortar-Stone/Brick	Bamboo/Timber-Light roof	Mud	TImber/Bamboo-Mud	Not attached	Rectangular	Stone, mud mortar	0
164089	18	315	20	Flat	Mud mortar-Stone/Brick	Bamboo/Timber-Light roof	Mud	TImber/Bamboo-Mud	Not attached	Rectangular	Stone, mud mortar	0
164098	45	290	13	Flat	Mud mortar-Stone/Brick	Bamboo/Timber-Light roof	Mud	TImber/Bamboo-Mud	Not attached	Rectangular	Stone, mud mortar	0
164103	21	230	13	Flat	Mud mortar-Stone/Brick	Bamboo/Timber-Light roof	Mud	TImber/Bamboo-Mud	Not attached	Rectangular	Stone, mud mortar	0