# Data Handling
import numpy as np
import pandas as pd

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# Utilities
from skopt.space import Real, Categorical, Integer

# Basic Machine Learning
import xgboost as xgb
from skopt import BayesSearchCV
from pdpbox import pdp, get_dataset, info_plots
from lime.lime_tabular import LimeTabularExplainer
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve,
                             auc, precision_recall_curve, make_scorer, confusion_matrix, plot_roc_curve)
# Deep Learning
from tensorflow import keras
from tensorflow.keras.optimizers import SGD


# Load the raw dataset with pandas
stroke = pd.read_csv('stroke.csv')

# View the first 3 rows of the dataset (only rendered as a notebook cell result)
stroke.head(3)


# Use the id column as the index of the DataFrame
stroke = stroke.set_index('id')

# Make all the column names lowercase
stroke.columns = stroke.columns.str.lower()


# Find out which columns contain missing values
print(stroke.isnull().any())

# Determine the number of missing values
missing_values = stroke['bmi'].isnull().sum()
print(f'\nThere are {missing_values} missing values in the bmi column\n')

# Drop the rows with the missing values
stroke = stroke.dropna()

# Check out the datatypes of the data.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
stroke.info()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool

There are 201 missing values in the bmi column

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 9046 to 44679
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.2+ KB
None


# Count the rows per gender category (shown as the notebook cell result)
stroke['gender'].value_counts()

Female    2897
Male      2011
Other        1
Name: gender, dtype: int64


# Remove the "Other" gender rows. A boolean mask drops every such row and is
# a no-op when none exists, whereas indexing `.index[0]` would raise.
stroke = stroke[stroke['gender'] != 'Other'].copy()

# Binary-encode gender (Female -> 1, Male -> 0). The result is assigned back
# because an in-place replace on a selected column is not guaranteed to
# modify the parent DataFrame.
stroke['gender'] = stroke['gender'].map({'Female': 1, 'Male': 0})


# Mean age and stroke rate per gender. Only the two numeric columns are
# aggregated, so the remaining text columns cannot break the mean.
# The stroke rate is slightly higher for men (0) than for women (1).
stroke.groupby('gender')[['age', 'stroke']].mean()


# Count the rows per ever_married category (shown as the notebook cell result)
stroke['ever_married'].value_counts()

Yes    3204
No     1704
Name: ever_married, dtype: int64


# Binary-encode ever_married (Yes -> 1, No -> 0), assigning the result back
# instead of relying on an in-place replace of a selected column
stroke['ever_married'] = stroke['ever_married'].map({'Yes': 1, 'No': 0})


# Check the values present in the work_type column
stroke['work_type'].value_counts()

Private          2810
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64


# One-hot encode work_type into 5 indicator columns.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (newer releases default to booleans).
dummy_variables = pd.get_dummies(stroke['work_type'], dtype=int)
del stroke['work_type']

# Attach the indicators to the dataframe under lowercase column names
stroke['govt_job'] = dummy_variables['Govt_job']
stroke['never_worked'] = dummy_variables['Never_worked']
stroke['private'] = dummy_variables['Private']
stroke['self-employed'] = dummy_variables['Self-employed']
stroke['children'] = dummy_variables['children']


# Check the values present in the residence_type column
stroke['residence_type'].value_counts()

Urban    2490
Rural    2418
Name: residence_type, dtype: int64


# Binary-encode residence_type (Urban -> 1, Rural -> 0), assigning the result
# back instead of relying on an in-place replace of a selected column
stroke['residence_type'] = stroke['residence_type'].map({'Urban': 1, 'Rural': 0})


# Check the values present in the smoking_status column
stroke['smoking_status'].value_counts()

never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: smoking_status, dtype: int64


# One-hot encode smoking_status into 4 indicator columns.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (newer releases default to booleans).
dummy_variables = pd.get_dummies(stroke['smoking_status'], dtype=int)
del stroke['smoking_status']

# Attach the indicators to the dataframe under snake_case column names
stroke['never_smoked'] = dummy_variables['never smoked']
stroke['formerly_smoked'] = dummy_variables['formerly smoked']
stroke['smokes'] = dummy_variables['smokes']
stroke['unknown_smoker'] = dummy_variables['Unknown']


# Reordering columns: move the target column 'stroke' to the end.
# The column is located by name, so the result does not depend on how many
# columns happen to precede it.
labels = [column for column in stroke.columns if column != 'stroke']
labels.append('stroke')
stroke = stroke.reindex(columns=labels)


# Persist the cleaned data to disk.
stroke.to_csv('stroke_clean.csv')


# Read the cleaned data back, using the id column as the index
stroke_clean = pd.read_csv('stroke_clean.csv', index_col='id')

# Share of each class among all rows, in percent
total_rows = stroke_clean.shape[0]
stroke_rows = stroke_clean[stroke_clean.stroke == 1].shape[0]
non_stroke_rows = stroke_clean[stroke_clean.stroke == 0].shape[0]
print("Percentage of stroke: ", np.round((stroke_rows / total_rows) * 100, 2), "%")
print("Percentage of non-stroke: ", np.round((non_stroke_rows / total_rows) * 100, 2), "%")

Percentage of stroke:  4.26 %
Percentage of non-stroke:  95.74 %


# Age histogram with side-by-side bars for the stroke / non-stroke groups
age_histogram = sns.histplot(data=stroke_clean, x="age", hue="stroke", multiple="dodge")
age_histogram.set_title("Age/stroke distribution")

Text(0.5, 1.0, 'Age/stroke distribution')


# Count the rows with an unknown smoking status versus those with a known one.
unknown_mask = stroke_clean["unknown_smoker"] == 1
information_about_smoker = np.where(
    unknown_mask,
    "unknown smoker",
    "information about smoking",
)
sns.histplot(data=information_about_smoker)
plt.show()


# Heatmap of the pairwise correlations between all columns
correlations = stroke_clean.corr()
sns.heatmap(correlations)
plt.show()


# Necessary functions for evaluating models

def curves(model_fit, x_test, y_test):
    """Plot the ROC curve and the precision-recall curve of a fitted classifier.

    Parameters
    ----------
    model_fit : fitted estimator exposing ``predict`` and ``predict_proba``.
    x_test : feature matrix handed to the estimator.
    y_test : true binary labels (1 = positive class) aligned with ``x_test``.

    The two panels are drawn side by side and shown with ``plt.show()``;
    nothing is returned.
    """
    # Positive-class probabilities, computed once and shared by both panels
    probs = model_fit.predict_proba(x_test)[:, 1]
    # Array view of the labels so list inputs work as well as pandas Series
    y_true = np.asarray(y_test)

    # ROC: a constant-score baseline versus the model
    baseline_probs = np.zeros(len(y_true))
    baseline_fpr, baseline_tpr, _ = roc_curve(y_true, baseline_probs)
    fpr, tpr, _ = roc_curve(y_true, probs, drop_intermediate=False)
    pc_auc = roc_auc_score(y_true, probs)

    fig, axs = plt.subplots(1, 2, figsize=(13, 5))

    axs[0].plot(baseline_fpr, baseline_tpr, linestyle="--", color="lightgrey")
    axs[0].plot(fpr, tpr, marker=".", label="ROC AUC=%.3f" % (pc_auc), color="orange")
    axs[0].set_xlabel("False Positive Rate")
    axs[0].set_ylabel("True Positive Rate")
    axs[0].set_title("ROC Curve")
    axs[0].legend()

    # Precision-recall curve; the no-skill line is the share of positives
    precision, recall, _ = precision_recall_curve(y_true, probs)
    no_skill = np.mean(y_true == 1)

    # zero_division=0 matches train_error/test_error and avoids a warning
    # when the model predicts no positives at all
    yhat = model_fit.predict(x_test)
    model_f1 = f1_score(y_true, yhat, zero_division=0)
    model_auc = auc(recall, precision)

    axs[1].plot([0, 1], [no_skill, no_skill], linestyle="--", color="lightgray")
    axs[1].plot(recall, precision, marker=".", label="f1=%.3f auc=%.3f" % (model_f1, model_auc), color="orange")
    axs[1].set_xlabel("Recall")
    axs[1].set_ylabel("Precision")
    axs[1].set_title("Precision-Recall Curve")
    axs[1].legend()

    plt.show()

def train_error(model_fit, X_train, y_train):
    """Print accuracy, precision, recall and F1 of ``model_fit`` on the training data.

    ``model_fit`` must expose ``predict``; precision, recall and F1 are
    reported as 0 when they would otherwise be undefined.
    """
    predictions = model_fit.predict(X_train)

    # (output template, metric function, extra keyword arguments)
    report = (
        ("accuracy: %.3f", accuracy_score, {}),
        ("precision: %.3f", precision_score, {"zero_division": 0}),
        ("recall: %.3f", recall_score, {"zero_division": 0}),
        ("f1: %.3f", f1_score, {"zero_division": 0}),
    )

    print("Train errors:")
    print()
    for template, metric, options in report:
        print(template % metric(y_train, predictions, **options))
    print()
    
def test_error(model_fit, X_test, y_test):
    """Print test-set metrics and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    model_fit : fitted estimator exposing ``predict``.
    X_test : feature matrix handed to the estimator.
    y_test : true binary labels encoded as 0 / 1.

    Accuracy, precision, recall and F1 are printed; precision, recall and F1
    are reported as 0 when they would otherwise be undefined. The heatmap is
    drawn on the current matplotlib axes and nothing is returned.
    """
    test_pred = model_fit.predict(X_test)

    print("Test errors:")
    print()
    print("accuracy: %.3f" % accuracy_score(y_test, test_pred))
    print("precision: %.3f" % precision_score(y_test, test_pred, zero_division=0))
    print("recall: %.3f" % recall_score(y_test, test_pred, zero_division=0))
    print("f1: %.3f" % f1_score(y_test, test_pred, zero_division=0))

    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs in y_test and the predictions; the reshape below needs it.
    cm_test = confusion_matrix(y_test, test_pred, labels=[0, 1])
    group_names = ["TN", "FP", "FN", "TP"]
    labels = np.asarray(
        [f"{name}\n{count:0.0f}" for name, count in zip(group_names, cm_test.flatten())]
    ).reshape(2, 2)
    sns.heatmap(cm_test, annot=labels, fmt="", annot_kws={"size": 16}, linewidths=0.5, cmap="YlGnBu")


# Separate the predictor columns from the target column
response = stroke_clean["stroke"]
predictors = stroke_clean.drop(columns="stroke")

# Hold out 25% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    predictors, response, random_state=0, test_size=0.25
)


# Class-weighted logistic regression (ridge); regularisation strength picked by 10-fold CV
logregfit = LogisticRegressionCV(
    cv=10,
    class_weight="balanced",
    max_iter=1000,
    random_state=0,
)
logregfit.fit(X_train, y_train)

# Tabulate the fitted coefficient next to each predictor name
table_of_coefficients = pd.DataFrame(X_train.columns)
table_of_coefficients["Coefficients"] = logregfit.coef_.T
table_of_coefficients.head()


# Mean accuracy on the training and test splits
print("Train accuracy: ", logregfit.score(X_train, y_train))
print("Test accuracy: ", logregfit.score(X_test, y_test))

Train accuracy:  0.7459929367019832
Test accuracy:  0.7302363488182559


# Plot the ROC and precision-recall curves of the logistic regression on the test split
curves(logregfit, X_test, y_test)


# Print test-set accuracy/precision/recall/F1 and draw the confusion-matrix heatmap
test_error(logregfit, X_test, y_test)

Test errors:

accuracy: 0.730
precision: 0.113
recall: 0.837
f1: 0.199


# Scaling the data: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Making the layers: two hidden ReLU layers with dropout and a sigmoid output.
# The input width is taken from the data instead of being hard-coded, so the
# network keeps matching the predictors if the feature set changes.
model = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(1, activation="sigmoid")
])

# Checking out the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 100)               1800      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
=================================================================
Total params: 12,001
Trainable params: 12,001
Non-trainable params: 0
_________________________________________________________________


# Compiling the model: binary cross-entropy loss, SGD optimizer selected by name.
# The metric order (recall, precision, accuracy) fixes the order of the values
# that model.evaluate() returns after the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer='SGD',
    metrics=[
        keras.metrics.Recall(), 
        keras.metrics.Precision(), 
        'accuracy'
    ]
)

# Choosing a weight: samples of class 1 (stroke) count 22 times as much as
# samples of class 0 in the loss
class_weight = {0: 1., 1: 22.}

# Fitting the model on the scaled training data for 5 epochs
model.fit(X_train_scaled, y_train, class_weight=class_weight, epochs=5)

Epoch 1/5
116/116 [==============================] - 1s 1ms/step - loss: 1.2904 - recall: 0.6565 - precision: 0.0517 - accuracy: 0.4670    
Epoch 2/5
116/116 [==============================] - 0s 1ms/step - loss: 1.2591 - recall: 0.6779 - precision: 0.0809 - accuracy: 0.6261
Epoch 3/5
116/116 [==============================] - 0s 1ms/step - loss: 1.0525 - recall: 0.6357 - precision: 0.0782 - accuracy: 0.7013  
Epoch 4/5
116/116 [==============================] - 0s 1ms/step - loss: 1.0441 - recall: 0.6971 - precision: 0.0953 - accuracy: 0.7028
Epoch 5/5
116/116 [==============================] - 0s 1000us/step - loss: 1.0487 - recall: 0.6752 - precision: 0.1098 - accuracy: 0.7433

<tensorflow.python.keras.callbacks.History at 0x14736f1c0>


# Evaluate on the scaled test split; values come back as loss, recall, precision, accuracy
results = model.evaluate(X_test_scaled, y_test)
test_loss, test_recall, test_precision, test_accuracy = results

print("\nRecall: ", round(test_recall, 3))
print("Precision: ", round(test_precision, 3))
print("Accuracy: ", round(test_accuracy, 3))

39/39 [==============================] - 0s 944us/step - loss: 0.5164 - recall: 0.8163 - precision: 0.1133 - accuracy: 0.7376

Recall:  0.816
Precision:  0.113
Accuracy:  0.738


# Hyperparameter grid for the plain decision tree
parameters = dict(
    criterion=["gini", "entropy"],
    max_depth=range(1, 8),
    min_samples_split=range(2, 8),
    min_samples_leaf=range(1, 8),
)

# Exhaustive search scored on plain accuracy with 3-fold cross-validation
decisiontree = GridSearchCV(
    DecisionTreeClassifier(random_state=23),
    parameters,
    cv=3,
    scoring="accuracy",
)

# Fit the decision tree; fit() hands back the search object itself,
# so both names refer to the same fitted search
decisiontree.fit(X_train, y_train)
decisiontree_fit = decisiontree

# Report the winning combination
print("Best hyperparameter:", decisiontree_fit.best_params_, sep="\n")

Best hyperparameter:
{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2}


# Print test-set accuracy/precision/recall/F1 of the decision tree and draw its confusion matrix
test_error(decisiontree_fit, X_test, y_test)

Test errors:

accuracy: 0.960
precision: 0.000
recall: 0.000
f1: 0.000


# Plot the ROC and precision-recall curves of the decision tree on the test split
curves(decisiontree_fit, X_test, y_test)


# Hyperparameter grid: same tree settings, now with optional class weighting
parameters = dict(
    criterion=["gini", "entropy"],
    max_depth=range(1, 8),
    min_samples_split=range(2, 8),
    min_samples_leaf=range(1, 8),
    class_weight=[None, "balanced"],
)

# Exhaustive search scored on balanced accuracy with 3-fold cross-validation
balanced_tree = GridSearchCV(
    DecisionTreeClassifier(random_state=23),
    parameters,
    cv=3,
    scoring="balanced_accuracy",
)

# Fit the decision tree; fit() hands back the search object itself,
# so both names refer to the same fitted search
balanced_tree.fit(X_train, y_train)
balanced_tree_fit = balanced_tree

# Report the winning combination
print("Best hyperparameters:", balanced_tree_fit.best_params_, sep="\n")

Best hyperparameters:
{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2}


# Print test-set accuracy/precision/recall/F1 of the balanced tree and draw its confusion matrix
test_error(balanced_tree, X_test, y_test)

Test errors:

accuracy: 0.522
precision: 0.074
recall: 0.959
f1: 0.138


# Plot the ROC and precision-recall curves of the balanced tree on the test split
curves(balanced_tree, X_test, y_test)


# Hyperparameter grid for the unweighted random forest
parameters = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[1, 5, 10, 30, 50, 100],
    max_features=('sqrt', 8, 12),
    max_depth=range(1, 10),
)

# Exhaustive search scored on balanced accuracy with 3-fold cross-validation
randomforest = GridSearchCV(
    RandomForestClassifier(bootstrap=True, random_state=23),
    parameters,
    cv=3,
    scoring="balanced_accuracy",
)

# Fit the random forest; fit() hands back the search object itself,
# so both names refer to the same fitted search
randomforest.fit(X_train, y_train)
randomforest_fit = randomforest

# Report the winning combination
print("Best hyperparameters:", randomforest_fit.best_params_, sep="\n")

Best hyperparameters:
{'criterion': 'gini', 'max_depth': 9, 'max_features': 8, 'n_estimators': 1}


# Print test-set accuracy/precision/recall/F1 of the random forest and draw its confusion matrix
test_error(randomforest, X_test, y_test)

Test errors:

accuracy: 0.942
precision: 0.077
recall: 0.041
f1: 0.053


# Plot the ROC and precision-recall curves of the random forest on the test split
curves(randomforest, X_test, y_test)


# Setting up parameters
parameters = {
    'n_estimators': [1, 10, 30, 50, 100],
    'max_features': ('sqrt', 8, 12),
    'max_depth': range(1, 10),
    # scikit-learn spells the per-bootstrap option "balanced_subsample";
    # a misspelt value makes every candidate that uses it fail to fit
    'class_weight': ["balanced", "balanced_subsample"]
}

# Creating the weighted random forest: grid search scored on balanced
# accuracy with 3-fold cross-validation, run on 8 parallel workers
weighted_forest = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=23,
        criterion='gini',
        bootstrap=True
    ),
    param_grid=parameters,
    scoring="balanced_accuracy",
    n_jobs=8,
    cv=3
)

# Fit the weighted random forest
weighted_forest_fit = weighted_forest.fit(X_train, y_train)

# Print out the best parameters
print("Best hyperparameters:")
print(weighted_forest.best_params_)

Best hyperparameters:
{'class_weight': 'balanced', 'max_depth': 1, 'max_features': 12, 'n_estimators': 10}


# Print test-set accuracy/precision/recall/F1 of the weighted forest and draw its confusion matrix
test_error(weighted_forest, X_test, y_test)

Test errors:

accuracy: 0.773
precision: 0.119
recall: 0.735
f1: 0.205


# Plot the ROC and precision-recall curves of the weighted forest on the test split
curves(weighted_forest, X_test, y_test)


# Setting up parameters
parameters = {
    'n_estimators': [1, 10, 30, 50, 100],
    'max_features': ('sqrt', 8, 12),
    'max_depth': range(1, 10),
    # The per-bootstrap option is spelt "balanced_subsample";
    # a misspelt value makes every candidate that uses it fail to fit
    'class_weight': [None, "balanced", "balanced_subsample"]
}

# Creating the balanced random forest: grid search scored on balanced
# accuracy with 3-fold cross-validation, run on 8 parallel workers
balanced_forest = GridSearchCV(
    estimator=BalancedRandomForestClassifier(
        random_state=23,
        criterion='gini',
        bootstrap=True
    ),
    param_grid=parameters,
    scoring="balanced_accuracy",
    n_jobs=8,
    cv=3
)

# Fit the balanced random forest
balanced_forest_fit = balanced_forest.fit(X_train, y_train)

# Print out the best parameters
print("Best hyperparameters:")
print(balanced_forest.best_params_)

Best hyperparameters:
{'class_weight': None, 'max_depth': 1, 'max_features': 8, 'n_estimators': 10}


# Print test-set accuracy/precision/recall/F1 of the balanced forest and draw its confusion matrix
test_error(balanced_forest, X_test, y_test)

Test errors:

accuracy: 0.619
precision: 0.085
recall: 0.878
f1: 0.156


# Plot the ROC and precision-recall curves of the balanced forest on the test split
curves(balanced_forest, X_test, y_test)


# Setting parameters for the search: integer ranges sampled by the optimizer
search_space = {
    'max_depth': Integer(1, 10),
    'n_estimators': Integer(1, 100),
    'max_features': Integer(1, 12)
}

# Creating the balanced random forest: 64 Bayesian-search iterations,
# 8 candidates proposed per round, scored on balanced accuracy with 3-fold CV.
# The search itself is seeded as well (not only the estimator) so that the
# sampled candidates, and therefore the reported results, are reproducible.
brandomforestBS = BayesSearchCV(
    estimator=BalancedRandomForestClassifier(
        random_state=41,
        bootstrap=True
    ),
    search_spaces=search_space,
    scoring="balanced_accuracy",
    n_iter=64,
    cv=3,
    n_jobs=8,
    n_points=8,
    random_state=41
)

# Fit the balanced random forest
brandomforestBS_fit = brandomforestBS.fit(X_train, y_train)

# Print out the best parameters
print("Best hyperparameters:")
print(brandomforestBS.best_params_)

Best hyperparameters:
OrderedDict([('max_depth', 1), ('max_features', 11), ('n_estimators', 67)])


# Print test-set accuracy/precision/recall/F1 of the Bayesian-tuned balanced forest and draw its confusion matrix
test_error(brandomforestBS, X_test, y_test)

Test errors:

accuracy: 0.597
precision: 0.086
recall: 0.939
f1: 0.157


# Plot the ROC and precision-recall curves of the Bayesian-tuned balanced forest on the test split
curves(brandomforestBS, X_test, y_test)


# Hyperparameter grid for the boosted trees
parameters = dict(
    n_estimators=[10, 30, 50, 100],
    eta=[0.2, 0.4, 1.0],
    max_depth=range(1, 10),
)

# Weight for the positive class: training rows of class 0 per training row of class 1
y_train_values = y_train.value_counts()
scale_weight = y_train_values.loc[0] / y_train_values.loc[1]

# Grid search over the boosted model, scored on balanced accuracy with 3-fold CV
boosting = GridSearchCV(
    xgb.XGBClassifier(
        booster="gbtree",
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        scale_pos_weight=scale_weight,
        random_state=23,
    ),
    parameters,
    cv=3,
    n_jobs=8,
    scoring="balanced_accuracy",
)

# Fit the boosted model; fit() hands back the search object itself,
# so both names refer to the same fitted search
boosting.fit(X_train, y_train)
boosting_fit = boosting

# Report the winning combination
print("Best hyperparameters:", boosting.best_params_, sep="\n")

Best hyperparameters:
{'eta': 0.4, 'max_depth': 1, 'n_estimators': 10}


# Print test-set accuracy/precision/recall/F1 of the grid-searched XGBoost model and draw its confusion matrix
test_error(boosting, X_test, y_test)

Test errors:

accuracy: 0.694
precision: 0.100
recall: 0.837
f1: 0.179


# Plot the ROC and precision-recall curves of the grid-searched XGBoost model on the test split
curves(boosting, X_test, y_test)


# Setting the search parameters: ranges sampled by the optimizer
search_space = {
    'n_estimators': Integer(1, 100),
    'eta': Real(1e-4, 1e0),
    'max_depth': Integer(1, 10)
}

# Manually calculate balanced scale weighting:
# training rows of class 0 per training row of class 1
y_train_values = y_train.value_counts()
scale_weight = y_train_values[0]/y_train_values[1]

# Creating the boosting model: 64 Bayesian-search iterations, 8 candidates
# proposed per round, scored on balanced accuracy with 3-fold CV.
# The search itself is seeded as well (not only the estimator) so that the
# sampled candidates, and therefore the reported results, are reproducible.
boostingBS = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        random_state=23,
        booster="gbtree",
        use_label_encoder=False,
        eval_metric="logloss",
        scale_pos_weight=scale_weight
    ),
    search_spaces=search_space,
    n_iter=64,
    scoring="balanced_accuracy",
    n_jobs=8,
    n_points=8,
    cv=3,
    random_state=23
)

# Fitting the boosting model
boostingBS_fit = boostingBS.fit(X_train, y_train)

# Print out the best parameters
print("Best hyperparameters:")
print(boostingBS.best_params_)

Best hyperparameters:
OrderedDict([('eta', 0.1509874389214828), ('max_depth', 1), ('n_estimators', 30)])


# Print test-set accuracy/precision/recall/F1 of the Bayesian-tuned XGBoost model and draw its confusion matrix
test_error(boostingBS, X_test, y_test)

Test errors:

accuracy: 0.688
precision: 0.099
recall: 0.837
f1: 0.176


# Plot the ROC and precision-recall curves of the Bayesian-tuned XGBoost model on the test split
curves(boostingBS, X_test, y_test)


# Refit a standalone XGBoost model with the hyperparameters found by the Bayesian search
model = xgb.XGBClassifier(
    **boostingBS.best_params_,
    booster="gbtree",
    eval_metric="logloss",
    random_state=23,
    scale_pos_weight=scale_weight,
    use_label_encoder=False,
)

# fit() hands back the estimator itself, so both names refer to the fitted model
model.fit(X_train, y_train)
modelfit = model

# Every column except the last one (the target) is a feature
features = stroke_clean.columns[:-1]


# Bar chart of the importance the model assigns to each feature
plt.figure()
plt.bar(features, model.feature_importances_)
plt.xticks(rotation=90)
plt.show()


# Partial dependence of the prediction on age
pdp_age = pdp.pdp_isolate(
    model=model, dataset=stroke_clean, model_features=features, feature="age"
)
fig, axes = pdp.pdp_plot(pdp_age, "age", plot_lines=True, plot_pts_dist=True)


# Partial dependence of the prediction on the average glucose level
pdp_glucose = pdp.pdp_isolate(
    model=model, dataset=stroke_clean, model_features=features, feature="avg_glucose_level"
)
fig, axes = pdp.pdp_plot(pdp_glucose, "avg_glucose_level", plot_lines=True, plot_pts_dist=True)


# Partial dependence of the prediction on hypertension
pdp_hyp = pdp.pdp_isolate(
    model=model, dataset=stroke_clean, model_features=features, feature="hypertension"
)
fig, axes = pdp.pdp_plot(pdp_hyp, "hypertension", plot_lines=True)


# Creating an explainer on the training data.
# class_names must be a sequence with one entry per class, ordered like the
# classifier's classes (0 = no stroke, 1 = stroke); a bare string would be
# indexed character by character.
explainer = LimeTabularExplainer(
    training_data = np.array(X_train.iloc[:,:]),
    training_labels = np.array(y_train.iloc[:]),
    feature_names = features,
    class_names = ["no stroke", "stroke"],
    mode = "classification"
)


# Pick test observation 1 and compare its true class with the model's probabilities
i = 1
obs = X_test.iloc[i]
true_class = y_test.iloc[i]
print(f"Observation no. {i} (true negative): ")
print("True class: ", true_class)
test_probabilities = model.predict_proba(X_test)
print("Predicted probabilities from model: ", test_probabilities[i])

# Explain this single prediction with LIME and render the explanation
exp = explainer.explain_instance(obs, model.predict_proba)
explanation_figure = exp.as_pyplot_figure()
print(explanation_figure)

Observation no. 1 (true negative): 
True class:  0
Predicted probabilities from model:  [0.8114888  0.18851118]
Figure(432x288)


# Pick test observation 2 and compare its true class with the model's probabilities
i = 2
obs = X_test.iloc[i]
true_class = y_test.iloc[i]
print(f"Observation no. {i} (false positive): ")
print("True class: ", true_class)
test_probabilities = model.predict_proba(X_test)
print("Predicted probabilities from model: ", test_probabilities[i])
# Explain this single prediction with LIME and render the explanation
exp = explainer.explain_instance(obs, model.predict_proba)
explanation_figure = exp.as_pyplot_figure()
print(explanation_figure)

Observation no. 2 (false positive): 
True class:  0
Predicted probabilities from model:  [0.23165935 0.76834065]
Figure(432x288)


# Pick test observation 10 and compare its true class with the model's probabilities
i = 10
obs = X_test.iloc[i]
true_class = y_test.iloc[i]
print(f"Observation no. {i} (true positive): ")
print("True class: ", true_class)
test_probabilities = model.predict_proba(X_test)
print("Predicted probabilities from model: ", test_probabilities[i])

# Explain this single prediction with LIME and render the explanation
exp = explainer.explain_instance(obs, model.predict_proba)
explanation_figure = exp.as_pyplot_figure()
print(explanation_figure)

Observation no. 10 (true positive): 
True class:  1
Predicted probabilities from model:  [0.19809508 0.8019049 ]
Figure(432x288)


# Pick test observation 543 and compare its true class with the model's probabilities
i = 543  # 213, 419
obs = X_test.iloc[i]
true_class = y_test.iloc[i]
print(f"Observation no. {i} (false negative): ")
print("True class: ", true_class)
test_probabilities = model.predict_proba(X_test)
print("Predicted probabilities from model: ", test_probabilities[i])

# Explain this single prediction with LIME and render the explanation
exp = explainer.explain_instance(obs, model.predict_proba)
explanation_figure = exp.as_pyplot_figure()
print(explanation_figure)

Observation no. 543 (false negative): 
True class:  1
Predicted probabilities from model:  [0.8114888  0.18851118]
Figure(432x288)

	age	stroke
gender
0	42.049130	0.044257
1	43.437805	0.041422

Model	Tuning	Precision	Recall	Accuracy	ROC-AUC
Logistic Regression	Built-in CV	0.113	0.837	0.730	0.854
Deep Neural Network	-	0.086	0.918	0.609	-
Balanced Decision Tree	GridSearchCV	0.074	0.959	0.522	0.732
Random Forest	GridSearchCV	0.080	0.041	0.943	0.598
Weighted Random Forest	GridSearchCV	0.119	0.735	0.773	0.818
Balanced Random Forest	GridSearchCV	0.085	0.878	0.619	0.791
Balanced Random Forest	BayesianSearchCV	0.094	0.918	0.645	0.836
XgBoost	GridSearchCV	0.100	0.837	0.694	0.848
XgBoost	BayesianSearchCV	0.093	0.878	0.654	0.850

stroke-prediction

We analyze a stroke dataset and formulate advanced statistical models for predicting whether a person has had a stroke based on measurable predictors.

Stroke Prediction¶

Using Deep Neural Networks, Tree-Based Methods, and Explainable AI¶

Information About the Data¶

More on the Different Variables¶

Importing Packages¶

Cleaning the Data¶

Gender¶

Ever Married¶

Work Type¶

Residence Type¶

Smoking Status¶

Exploratory Data Analysis¶

Utility Functions¶

Two Simple Models¶

Logistic Regression (with Ridge Penalty)¶

A Simple Deep Neural Network¶

Discussion on Evaluating Performance and Hyperparameters¶

Evaluating Performance¶

Hyperparameter tuning¶

Tree-Based Models¶

Decision tree¶

Random Forests¶

Weighted Random Forest¶

Balanced Random Forest¶

Bayesian Search¶

XgBoost¶

Interpretation Through Explainable AI¶

Global methods¶

Local Methods¶

Comparing the different models.¶

	id	gender	age	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	9046	Male	67.0	1	Yes	Private	Urban	228.69	36.6	formerly smoked	1
1	51676	Female	61.0	0	Yes	Self-employed	Rural	202.21	NaN	never smoked	1
2	31112	Male	80.0	1	Yes	Private	Rural	105.92	32.5	never smoked	1

	0	Coefficients
0	gender	0.080890
1	age	0.079970
2	hypertension	0.623286
3	heart_disease	0.242387
4	ever_married	-0.034158