Notebook example
Experimentation: preprocessing and models training
Experimentation: preprocessing and models training
Let’s use the mortgage dataset published by the Federal Financial Institutions Examination Council under the Home Mortgage Disclosure Act: since 1975, lending institutions have been required to report public loan data. We will use a sample from 2016 to predict whether an application was approved (1) or denied (0).
[ ]:
import numpy as np
import collections
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
import xgboost as xgb
xgb.set_config(verbosity=0)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')
Data Preprocessing
[ ]:
# Column name -> dtype mapping handed to pandas.read_csv by preprocessing().
# Columns typed 'category' are the ones preprocessing() one-hot encodes
# (except 'purchaser_type', which it drops first).
COLUMN_NAMES = collections.OrderedDict(
    [
        ('as_of_year', np.int16),
        ('agency_code', 'category'),
        ('loan_type', 'category'),
        ('property_type', 'category'),
        ('loan_purpose', 'category'),
        ('occupancy', np.int8),
        ('loan_amt_thousands', np.float64),
        ('preapproval', 'category'),
        ('county_code', np.float64),
        ('applicant_income_thousands', np.float64),
        ('purchaser_type', 'category'),
        ('hoepa_status', 'category'),
        ('lien_status', 'category'),
        ('population', np.float64),
        ('ffiec_median_fam_income', np.float64),
        ('tract_to_msa_income_pct', np.float64),
        ('num_owner_occupied_units', np.float64),
        ('num_1_to_4_family_units', np.float64),
        # Binary target column.
        ('approved', np.int8),
    ]
)
def preprocessing(data):
    """Load the mortgage CSV and split it into train/test features and labels.

    Steps: read the file with the dtypes in ``COLUMN_NAMES``, drop rows with
    missing values, shuffle (seed 2), separate the ``approved`` label, drop
    ``purchaser_type``, one-hot encode the categorical columns, and split
    with ``train_test_split`` (seed 2, default test size).

    Args:
        data: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the feature splits are
        DataFrames with identical columns and the label splits are NumPy
        arrays taken from the ``approved`` column.
    """
    data = pd.read_csv(data, index_col=False, dtype=COLUMN_NAMES)
    data = data.dropna()
    data = shuffle(data, random_state=2)
    labels = data['approved']

    features = data.drop(columns=['approved', 'purchaser_type'])
    # 'Unnamed: 0' is the name pandas assigns to a header-less first column.
    # Drop it from the whole frame before splitting so train and test stay in
    # sync; errors='ignore' keeps the function usable on files without it.
    features = features.drop(columns='Unnamed: 0', errors='ignore')

    # select_dtypes reads the dtypes of the frame being encoded, instead of
    # indexing with a boolean mask built from a differently-shaped frame.
    dummy_columns = list(features.select_dtypes(include='category').columns)
    features = pd.get_dummies(features, columns=dummy_columns)

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels.values, random_state=2
    )

    # Internal invariant: both splits come from the same frame, so their
    # columns must match exactly (the former `.any() == .any()` comparison
    # was always True and checked nothing).
    assert x_test.columns.equals(x_train.columns)
    return x_train, x_test, y_train, y_test
[ ]:
# Build the train/test feature frames and label arrays from the sample CSV
# (path is relative to the notebook's working directory).
X_train, X_test, y_train, y_test = preprocessing("mortgage_extra_small.csv")
Now let’s prepare a PyTorch DataLoader.
[ ]:
class TrainData(Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed with the same
        # position in __getitem__.
        self.y_data = y_data
        self.X_data = X_data

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label
# Cast explicitly to float32. With pandas >= 2.0, get_dummies yields bool
# columns; mixed with the float columns, DataFrame.to_numpy() then returns an
# object array, which torch.FloatTensor cannot convert.
train_data = TrainData(
    torch.tensor(X_train.to_numpy(dtype=np.float32)),
    torch.tensor(y_train, dtype=torch.float32),
)
# Mini-batches of 64 rows, reshuffled at every epoch.
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
Models Training
Let’s train 4 competing models: an XGBoost classifier, a random forest, a logistic regression and a fully connected PyTorch model.
[ ]:
# Gradient-boosted trees limited to depth 3, trained on the binary target.
xboost_depth_3 = xgb.XGBClassifier(objective='binary:logistic', max_depth=3)
xboost_depth_3.fit(X_train, y_train, verbose=False)
[ ]:
# Random forest: 50 trees of depth at most 7, seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=50, max_depth=7, random_state=0)
rf.fit(X_train, y_train)
[ ]:
# Logistic-regression baseline with default hyper-parameters (seeded).
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
[ ]:
class MLP(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per row.

    Architecture: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm
    -> Dropout -> Linear(1). No sigmoid is applied, so the output is meant
    for a logit-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        n_features: Number of input features (defaults to 34, the value that
            was previously hard-coded).
        hidden_size: Width of both hidden layers (default 64).
        dropout_p: Dropout probability applied before the output layer
            (default 0.1).
    """

    def __init__(self, n_features=34, hidden_size=64, dropout_p=0.1):
        super().__init__()
        self.layer_1 = nn.Linear(n_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Return logits of shape ``(batch, 1)`` for ``inputs`` of shape ``(batch, n_features)``."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        return self.layer_out(x)
[ ]:
# Optimisation setup: Adam on all MLP parameters, binary cross-entropy
# computed directly on the logits the MLP returns.
LEARNING_RATE = 0.001
model_mlp = MLP()
optimizer = optim.Adam(model_mlp.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
[ ]:
EPOCHS = 50
# Mean batch loss of every epoch, kept so convergence can be inspected after
# training (the running loss used to be accumulated and then discarded).
epoch_losses = []
# Make sure dropout and batch-norm are in training mode before optimising.
model_mlp.train()
for e in range(EPOCHS):
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model_mlp(X_batch)
        # Targets are 1-D; add a trailing dim to match the (batch, 1) logits.
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_losses.append(epoch_loss / max(len(train_loader), 1))
Use althiqa’s API to create your report
Use althiqa’s API to create your report
[ ]:
import althiqa_lib
Login
[ ]:
import getpass
import os

# Connection settings can be overridden through environment variables; the
# URL and e-mail fall back to the demo values used by this notebook.
# NOTE(review): the endpoint is plain HTTP, so credentials are sent
# unencrypted — confirm whether an HTTPS endpoint is available.
url = os.environ.get(
    'ALTHIQA_URL',
    'http://ec2-15-188-65-181.eu-west-3.compute.amazonaws.com:8000',
)
email = os.environ.get('ALTHIQA_EMAIL', 'victor.storchan@althiqa.io')
# The password is no longer stored in the notebook: it is read from
# ALTHIQA_PASSWORD, or prompted for interactively when the variable is unset.
pwd = os.environ.get('ALTHIQA_PASSWORD') or getpass.getpass('althiqa password: ')
sess = althiqa_lib.Session(url, email, pwd)
Create your project
[ ]:
# Create the "Credit Scoring Mortgage21" project through the logged-in
# session, handing over the train/test features and labels.
# project_type="classif" presumably selects a classification project —
# confirm against the althiqa_lib documentation.
project = sess.create_project("Credit Scoring Mortgage21",
X_train,
y_train,
X_test,
y_test,
project_type="classif" )
[ ]:
# Fetch a project by name through the session.
# NOTE(review): the name here ends in "12" while the project created in this
# notebook ends in "21", and project2 is not used in the cells shown here —
# confirm this is an intentionally different, pre-existing project.
project2 = sess.get_project('Credit Scoring Mortgage12')
Push your models to evaluate, compare and rank them
[ ]:
# The cell used to pass `xboost`, a name that is not defined in the visible
# notebook (NameError). The only XGBoost estimator trained above is
# `xboost_depth_3`, so that is what gets pushed under the 'xgboost3' label.
# NOTE(review): the next cell pushes the same estimator as 'xboost_depth_3';
# drop one of the two if the duplicate entry is not wanted.
project.push_model('xgboost3', xboost_depth_3, threshold=0.5)
[ ]:
# Push the depth-3 XGBoost classifier to the project as 'xboost_depth_3'.
# threshold is presumably the probability cut-off for the positive class — confirm.
project.push_model('xboost_depth_3', xboost_depth_3, threshold = 0.5)
[ ]:
# Push the random-forest classifier to the project as 'random forest'.
project.push_model('random forest', rf, threshold = 0.5)
[ ]:
# Push the logistic-regression baseline to the project as 'logistic regression'.
project.push_model('logistic regression', lr, threshold = 0.5)
[ ]:
class WrapperPytorchClassif(BaseEstimator):
    """Adapter exposing a binary PyTorch classifier through ``predict_proba``.

    The wrapped model is called on a float tensor and its output is treated
    as one logit per row; a sigmoid turns it into the positive-class
    probability.

    Args:
        model: The ``torch.nn.Module`` to wrap.
        X: Unused; kept for interface compatibility.
        y: Unused; kept for interface compatibility.
    """

    def __init__(self, model=None, X=None, y=None):
        self.model = model
        # BaseEstimator.get_params() (used by repr() and clone()) reads every
        # __init__ argument back as an attribute, so the unused ones must be
        # stored as well or those calls raise AttributeError.
        self.X = X
        self.y = y

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_samples, 2)`` float64 array.

        Column 0 is ``1 - p`` and column 1 is ``p``, where ``p`` is the
        sigmoid of the model output.

        Args:
            X: A ``torch.Tensor`` (used as-is), a pandas DataFrame, or any
                array-like convertible to a float32 array.
        """
        if not isinstance(X, torch.Tensor):
            values = X.to_numpy() if hasattr(X, 'to_numpy') else X
            # Explicit float32 cast: a frame mixing bool and float columns
            # converts to an object array that torch cannot ingest.
            X = torch.tensor(np.asarray(values, dtype=np.float32))

        # Inference must run in eval mode: otherwise dropout stays active and
        # batch-norm normalises with the statistics of the prediction batch.
        # The previous mode is restored so training can continue afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                logits = self.model(X)
        finally:
            self.model.train(was_training)

        positive = torch.sigmoid(logits).cpu().numpy().astype(np.float64).reshape(-1)
        return np.column_stack((1.0 - positive, positive))
[ ]:
# Wrap the trained PyTorch MLP so it exposes predict_proba like the other models.
Wrapped_MLP = WrapperPytorchClassif(model = model_mlp)
[ ]:
# Push the wrapped MLP to the project as 'MLP'.
project.push_model('MLP', Wrapped_MLP, threshold = 0.5)
Push any custom metrics that are relevant to the project
[ ]:
# Let's define a custom ROI metric.
def ROI_custom(y_pred, y_test, x_test):
    """Return the cumulative ROI of the predictions, in thousands.

    With ``factor = ((1 + 0.05) ** 10 - 1) / 0.05`` and ``amt`` the row's
    ``loan_amt_thousands``, every row contributes:

    * predicted 1, actual 1: ``+ amt * factor``
    * predicted 0, actual 1: ``- amt * factor``
    * predicted 1, actual 0: ``- amt``
    * predicted 0, actual 0: nothing

    Rows are matched by position, not by index label.

    Args:
        y_pred: Array-like of predicted labels (0/1).
        y_test: Array-like of true labels (0/1).
        x_test: DataFrame with a ``loan_amt_thousands`` column.

    Returns:
        The summed contribution as a float (0.0 for empty input).

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """
    interest_rate = 0.05
    num_years = 10
    factor = ((1 + interest_rate) ** num_years - 1) / interest_rate

    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y_test).ravel()
    amounts = x_test["loan_amt_thousands"].to_numpy(dtype=float)
    if not (len(predicted) == len(actual) == len(amounts)):
        raise ValueError(
            "y_pred, y_test and x_test must have the same number of rows"
        )

    # Boolean masks replace the per-row Python loop and .iloc lookups.
    true_pos = (predicted == 1) & (actual == 1)
    false_neg = (predicted == 0) & (actual == 1)
    false_pos = (predicted == 1) & (actual == 0)

    gained = amounts[true_pos].sum() - amounts[false_neg].sum()
    return float(factor * gained - amounts[false_pos].sum())
[ ]:
# Register ROI_custom with the project as a custom metric named 'ROI'.
project.create_metric('ROI', ROI_custom)