Earthquake Damage In Gorkha🇳🇵
Part 3: Predicting Damage with Decision Trees
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import sqlite3
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=FutureWarning)
GOALS
Machine Learning Workflow
Prepare Data
Build Model
Communicate Results
def wrangle(db_path):
# Connect to database
conn = sqlite3.connect(db_path)
# Construct query
query = """
SELECT distinct(i.building_id) AS b_id,
s.*,
d.damage_grade
FROM id_map AS i
JOIN building_structure AS s ON i.building_id = s.building_id
JOIN building_damage AS d ON i.building_id = d.building_id
WHERE district_id = 4
"""
# Read query results into DataFrame
df = pd.read_sql(query, conn, index_col="b_id")
# Identify leaky columns
drop_cols = [col for col in df.columns if "post_eq" in col]
# Add high-cardinality / redundant column
drop_cols.append("building_id")
# Create binary target column
df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
df["severe_damage"] = (df["damage_grade"] > 3).astype(int)
# Drop old target
drop_cols.append("damage_grade")
# Drop multicollinearity column
drop_cols.append("count_floors_pre_eq")
# Drop columns
df.drop(columns=drop_cols, inplace=True)
return df
# Use the wrangle function above to import my data set into the DataFrame df
df = wrangle("/home/jovyan/nepal.sqlite") # Set path to the SQLite database as "/home/jovyan/nepal.sqlite"
df.head()
age_building | plinth_area_sq_ft | height_ft_pre_eq | land_surface_condition | foundation_type | roof_type | ground_floor_type | other_floor_type | position | plan_configuration | superstructure | severe_damage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
b_id | ||||||||||||
164002 | 20 | 560 | 18 | Flat | Mud mortar-Stone/Brick | Bamboo/Timber-Light roof | Mud | TImber/Bamboo-Mud | Not attached | Rectangular | Stone, mud mortar | 0 |
164081 | 21 | 200 | 12 | Flat | Mud mortar-Stone/Brick | Bamboo/Timber-Light roof | Mud | TImber/Bamboo-Mud | Not attached | Rectangular | Stone, mud mortar | 0 |
164089 | 18 | 315 | 20 | Flat | Mud mortar-Stone/Brick | Bamboo/Timber-Light roof | Mud | TImber/Bamboo-Mud | Not attached | Rectangular | Stone, mud mortar | 0 |
164098 | 45 | 290 | 13 | Flat | Mud mortar-Stone/Brick | Bamboo/Timber-Light roof | Mud | TImber/Bamboo-Mud | Not attached | Rectangular | Stone, mud mortar | 0 |
164103 | 21 | 230 | 13 | Flat | Mud mortar-Stone/Brick | Bamboo/Timber-Light roof | Mud | TImber/Bamboo-Mud | Not attached | Rectangular | Stone, mud mortar | 0 |
# Create my feature matrix X and target vector y
target = "severe_damage" # My target is "severe_damage"
X = df.drop(columns=target)
y = df[target]
# Divide my data (X and y) into training and test sets using a randomized train-test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42 # My test set should be 20% of my total data & set a random_state for reproducibility
)
# Divide my training data (X_train and y_train) into training and validation sets using a randomized train-test split
X_train, X_val, y_train, y_val = train_test_split(
X_train, y_train, test_size=0.2, random_state=42 # My validation data should be 20% of the remaining data
)
# Calculate the baseline accuracy score for my model
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 2))
Baseline Accuracy: 0.64
Now I want to create a pipeline named model
that contains a OrdinalEncoder
transformer and a DecisionTreeClassifier
predictor.
# Build Model
model = make_pipeline(
OrdinalEncoder(), DecisionTreeClassifier(random_state=42) # Set a random_state for my predictor
)
# Fit model to training data
model.fit(X_train, y_train)
Pipeline(steps=[('ordinalencoder', OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'dat... E-shape 7 H-shape 8 Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])), ('decisiontreeclassifier', DecisionTreeClassifier(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('ordinalencoder', OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'dat... E-shape 7 H-shape 8 Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])), ('decisiontreeclassifier', DecisionTreeClassifier(random_state=42))])
OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'data_type': dtype('O'), 'mapping': Mud mo... 'mapping': Rectangular 1 T-shape 2 Square 3 L-shape 4 Multi-projected 5 U-shape 6 E-shape 7 H-shape 8 Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])
DecisionTreeClassifier(random_state=42)
# Calculate the training and validation accuracy scores for my models
acc_train = accuracy_score(y_train, model.predict(X_train))
acc_val = model.score(X_val, y_val)
print("Training Accuracy:", round(acc_train, 2))
print("Validation Accuracy:", round(acc_val, 2))
Training Accuracy: 0.98 Validation Accuracy: 0.65
# Use the get_depth method on the DecisionTreeClassifier in my model to see how deep my tree grew during training
tree_depth = model.named_steps["decisiontreeclassifier"].get_depth()
print("Tree Depth:", tree_depth)
Tree Depth: 49
# Create a range of possible values for max_depth hyperparameter of my model's DecisionTreeClassifier
depth_hyperparams = range(1, 50, 2) # depth_hyperparams should range from 1 to 50 by steps of 2
I need a code that trains a model for every max_depth
in depth_hyperparams
. Every time a new model is trained, the code should also calculate the training and validation accuracy scores and append them to the training_acc
and validation_acc
lists, respectively.
# Create empty lists for training and validation accuracy scores
training_acc = []
validation_acc = []
for d in depth_hyperparams:
# Create model with `max_depth` of `d`
test_model = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(max_depth=d, random_state=42)
)
# Fit model to training data
test_model.fit(X_train, y_train)
# Calculate training accuracy score and append to `training_acc`
training_acc.append(test_model.score(X_train, y_train))
# Calculate validation accuracy score and append to `training_acc`
validation_acc.append(test_model.score(X_val, y_val))
print("Training Accuracy Scores:", training_acc[:3])
print("Validation Accuracy Scores:", validation_acc[:3])
Training Accuracy Scores: [0.7071072484228174, 0.7117395332421582, 0.7162394670666608] Validation Accuracy Scores: [0.7088406564319746, 0.7132521616375508, 0.7166049055937886]
Now I can create a visualization with two lines. The first line should plot the training_acc
values as a function of depth_hyperparams
, and the second should plot validation_acc
as a function of depth_hyperparams
.
# Plot `depth_hyperparams`, `training_acc`
plt.plot(depth_hyperparams, training_acc, label="training")
plt.plot(depth_hyperparams, validation_acc, label="validation")
plt.xlabel("Max Depth") # Label x-axis as "Max Depth"
plt.ylabel("Accuracy Score") # Label y-axis as "Accuracy Score"
plt.legend(); # Insert a legend so that the two lines can be distinguished
# Build Model
model = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(max_depth=6, random_state=42)
)
# Fit model to training data
model.fit(X_train, y_train)
Pipeline(steps=[('ordinalencoder', OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'dat... Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])), ('decisiontreeclassifier', DecisionTreeClassifier(max_depth=6, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('ordinalencoder', OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'dat... Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])), ('decisiontreeclassifier', DecisionTreeClassifier(max_depth=6, random_state=42))])
OrdinalEncoder(cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'superstructure'], mapping=[{'col': 'land_surface_condition', 'data_type': dtype('O'), 'mapping': Flat 1 Moderate slope 2 Steep slope 3 NaN -2 dtype: int64}, {'col': 'foundation_type', 'data_type': dtype('O'), 'mapping': Mud mo... 'mapping': Rectangular 1 T-shape 2 Square 3 L-shape 4 Multi-projected 5 U-shape 6 E-shape 7 H-shape 8 Others 9 Building with Central Courtyard 10 NaN -2 dtype: int64}, {'col': 'superstructure', 'data_type': dtype('O'), 'mapping': Stone, mud mortar 1 Stone 2 RC, engineered 3 Brick, cement mortar 4 Adobe/mud 5 Timber 6 RC, non-engineered 7 Brick, mud mortar 8 Stone, cement mortar 9 Bamboo 10 Other 11 NaN -2 dtype: int64}])
DecisionTreeClassifier(max_depth=6, random_state=42)
acc_train = accuracy_score(y_train, model.predict(X_train))
acc_val = model.score(X_val, y_val)
print("Training Accuracy:", round(acc_train, 2))
print("Validation Accuracy:", round(acc_val, 2))
Training Accuracy: 0.72 Validation Accuracy: 0.72
Based on my visualization, I will choose the max_depth
value that leads to the best validation accuracy score. Then will retrain my original model with that max_depth
value. Lastly, I will check how my tuned model performs on my test set by calculating the test accuracy score below. This will help me resolve the overfitting problem with this new max_depth
.
test_acc = model.score(X_test, y_test)
print("Test Accuracy:", round(test_acc, 2))
Test Accuracy: 0.72
Now I need to create a code to use the plot_tree
function from scikit-learn to visualize the decision logic of my model.
# Create larger figure
fig, ax = plt.subplots(figsize=(25, 12))
# Plot tree
plot_tree(
decision_tree=model.named_steps["decisiontreeclassifier"],
feature_names=X_train.columns,
filled=True, # Color leaf with class
rounded=True, # Round leaf edges
proportion=True, # Display proportion of classes in leaf
max_depth=3, # Only display first 3 levels
fontsize=12, # Enlarge font
ax=ax, # Place in figure axis
);
# Assign the feature names and importances of my model to the variables
features = X_train.columns # For the features, I can get them from the column names in my training set
importances = model.named_steps["decisiontreeclassifier"].feature_importances_ # For the importances, I access the feature_importances_ attribute of my model's DecisionTreeClassifier
print("Features:", features[:3])
print("Importances:", importances[:3])
Features: Index(['age_building', 'plinth_area_sq_ft', 'height_ft_pre_eq'], dtype='object') Importances: [0.03515085 0.04618639 0.08839161]
# Create a pandas Series named feat_imp, where the index is features and the values are my importances
feat_imp = pd.Series(importances, index=features).sort_values() # The Series should be sorted from smallest to largest importance
feat_imp.head()
position 0.000644 plan_configuration 0.004847 foundation_type 0.005206 roof_type 0.007620 land_surface_condition 0.020759 dtype: float64
feat_imp.sum()
0.9999999999999999
Finally, I can create a horizontal bar chart with all the features in feat_imp
.
# Create horizontal bar chart
feat_imp.plot(kind="barh")
plt.xlabel("Gini Importance") # Label x-axis "Gini Importance"
plt.ylabel("Feature");