# Script with Autogluon modules
import pandas as pd
import os, sys
import matplotlib
import matplotlib.pyplot as plt
import loaders_multilabel as mll
import time
from autogluon.tabular import TabularDataset
from autogluon.tabular import TabularPredictor as task
from autogluon.core.utils import infer_problem_type
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
## ------------------------------ Training ------------------------------ #
# Trains multiple models and ensembles them. Fits neural-network and tree ensembles.
# - Size of the validation set determines the VARIANCE of performance estimates
# - Bias: number of modeling decisions made based on validation performance
# K-fold CV works better for smaller datasets.
# AG ignores classes with < 10 instances (threshold is configurable).
# NOTE: You can supply the validation set manually: fit(..., tuning_data=validation_data)
#       (see the sketch after train_main below).
# dataf: train features with target column; targetcol: name of the target column
def train_main(dataf, targetcol, malinst, hostfts):
    agdir = os.getcwd() + '/AGmodels'
    modeldir = agdir + "/" + str(malinst) + "_" + str(hostfts) + "/"
    os.makedirs(modeldir, exist_ok=True)  # also creates AGmodels/ if it does not exist yet
    predictor = task(label=targetcol, path=modeldir, eval_metric='balanced_accuracy').fit(dataf, verbosity=4)
    return predictor
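
# A minimal sketch (not part of the original pipeline) of the manual-validation option
# mentioned in the NOTE above. traindf_part and validation_data are hypothetical
# placeholders for a user-provided split.
def train_with_manual_validation(traindf_part, validation_data, targetcol, modeldir):
    # With tuning_data supplied (and bagging disabled), AutoGluon skips its internal
    # train/validation split and uses validation_data for model selection instead.
    predictor = task(label=targetcol, path=modeldir,
                     eval_metric='balanced_accuracy').fit(
        train_data=traindf_part, tuning_data=validation_data, verbosity=2)
    return predictor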
# Multi-layer stacking feeds the predictions of the base models to stacker models.
# With auto_stack=True, AG automatically chooses the k-fold CV (k=10 here) and the
# number of bagging repeats (n=20).
# L: 2 layers of stacker models followed by a weighted ensemble (higher weight for the
# models that performed better); model predictions are aggregated by these weights to
# produce the final prediction.
def train_multilayerstacking(traindf, target, malinst, hostfts):
    stackdir = os.getcwd() + '/AGmodels/stacked/' + str(malinst) + "_" + str(hostfts) + "/"
    os.makedirs(stackdir, exist_ok=True)
    predstack = task(label=target, path=stackdir, eval_metric='balanced_accuracy').fit(
        train_data=traindf, auto_stack=True, verbosity=3)
    return predstack
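
# A minimal sketch (assumption, not used by this pipeline) of controlling bagging and
# stacking explicitly instead of relying on auto_stack=True. num_bag_folds, num_bag_sets
# and num_stack_levels are TabularPredictor.fit() arguments in recent AutoGluon releases
# (older versions used num_bagging_folds / stack_ensemble_levels).
def train_explicit_stacking(traindf, target, modeldir):
    predstack = task(label=target, path=modeldir,
                     eval_metric='balanced_accuracy').fit(
        train_data=traindf,
        num_bag_folds=10,    # 10-fold bagging of every base model
        num_bag_sets=2,      # repeat the bagging procedure twice
        num_stack_levels=1,  # one stacker layer on top of the bagged base models
        verbosity=3)
    return predstack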
# ------------------------- Test functions ------------------------------ #
# Prediction is done with the model that has the best validation performance.
# AG builds ensembles that maximize validation performance.
def test_main(Xtest, ytest, pred, testdf, traindf, calcftimpo=False):
    modelperf = pred.leaderboard(testdf, silent=True)
    print("[*]Model performance breakdown on Test data:")
    print(modelperf)
    ypred = pred.predict(Xtest)
    ypredproba = pred.predict_proba(Xtest)
    perf = pred.evaluate_predictions(y_true=ytest, y_pred=ypred, auxiliary_metrics=True)
    print("[*]Predictions: ", ypred)
    print("[*]Confidence in predictions:\n")
    print(pd.DataFrame(ypredproba, columns=pred.class_labels))
    # Score of each model
    print("Perf: ", perf)
    print("Getting confusion matrix.....")
    cmatrix = confusion_matrix(ytest, ypred).ravel().tolist()
    print(cmatrix)
    auc_score = roc_auc_score(ytest, ypredproba.iloc[:, 1])
    print("AUC score for best model: ", auc_score)
    ftimpo = None
    if calcftimpo:
        ftimpo = pred.feature_importance(traindf)
        print("Feature Importance on test data: ", ftimpo)
    bestmodel = pred.get_model_best()
    return modelperf, ftimpo, cmatrix, ytest, ypredproba, bestmodel, perf, auc_score
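
# A minimal sketch (assumption) showing how the probabilities returned by test_main could
# be turned into full ROC and precision-recall curves, using the sklearn and matplotlib
# imports already at the top of this script.
def plot_curves(ytest, ypredproba, outfile="curves.png"):
    probs = ypredproba.iloc[:, 1]                    # probability of the positive class
    fpr, tpr, _ = roc_curve(ytest, probs)
    prec, rec, _ = precision_recall_curve(ytest, probs)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(fpr, tpr)
    ax1.set_xlabel("FPR"); ax1.set_ylabel("TPR"); ax1.set_title("ROC curve")
    ax2.plot(rec, prec)
    ax2.set_xlabel("Recall"); ax2.set_ylabel("Precision"); ax2.set_title("PR curve")
    fig.savefig(outfile)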
def test_NN(Xtest, ytest, pred):
    ypred = pred.predict(Xtest, model="NeuralNetClassifier")
    perf = pred.evaluate_predictions(y_true=ytest, y_pred=ypred, auxiliary_metrics=True)
    print("Predictions: ", ypred)
    print("NN performance: ", perf)
    return
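
# Model names vary across AutoGluon versions (e.g. "NeuralNetTorch" / "NeuralNetFastAI"
# in newer releases). A minimal sketch (assumption) of looking the name up via
# get_model_names() instead of hard-coding it:
def test_any_nn(Xtest, ytest, pred):
    nn_models = [m for m in pred.get_model_names() if "NeuralNet" in m]
    for name in nn_models:
        ypred = pred.predict(Xtest, model=name)
        perf = pred.evaluate_predictions(y_true=ytest, y_pred=ypred, auxiliary_metrics=True)
        print("Performance of", name, ":", perf)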
# Validation performance
def result_summary(pred):
    results = pred.fit_summary()
    print("[*]Summary of models fitting: ", results)
    return
def test_stack(Xtest, ytest, predstack, testdf, traindf, calcftimpo=False):
    ypred = predstack.predict(Xtest)
    ypredproba = predstack.predict_proba(Xtest)
    perf = predstack.evaluate_predictions(y_true=ytest, y_pred=ypred, auxiliary_metrics=True)
    print("[*]Predictions: ", ypred)
    test_perf = predstack.leaderboard(testdf, silent=True)
    print("$$$$$$$$ RESULT STACKING $$$$$$$$\n", test_perf)
    ftimpo = None
    if calcftimpo:
        ftimpo = predstack.feature_importance(traindf)
        print("Feature Importance on test data: ", ftimpo)
    auc_score = roc_auc_score(ytest, ypredproba.iloc[:, 1])
    cmatrix = confusion_matrix(ytest, ypred).ravel().tolist()
    print("Confusion matrix stacked: ", cmatrix)
    print("AUC using stacked model: ", auc_score)
    return test_perf, ftimpo, cmatrix, auc_score
def evaluate(Xtest, ytest, mpath):
    pred = task.load(mpath)
    allmodels = pred.get_model_names()
    bestmodel = pred.get_model_best()
    print("All model names: ", allmodels, "\nBest performing model (on validation set): ", bestmodel)
    print("Ytest: ", ytest[0:10])
    ypred = pred.predict(Xtest)
    ypredproba = pred.predict_proba(Xtest)
    print("Ypred: ", ypred[0:10])
    print("Ypredprob: ", ypredproba[0:10])
    perf = pred.evaluate_predictions(y_true=ytest, y_pred=ypredproba, auxiliary_metrics=True)
    tn, fp, fn, tp = confusion_matrix(ytest, ypred).ravel()
    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)
    print("tn:%d fp:%d fn:%d tp:%d" % (tn, fp, fn, tp))
    print("FPR: ", fpr * 100)
    print("TPR: ", tpr * 100)
    ap = average_precision_score(ytest, ypredproba.iloc[:, 1])
    print("Average Precision score: ", ap)
    print("Autogluon evaluation: ", perf)
    print("Len ytest/pred/predproba: ", ytest.size, ypred.size, ypredproba.size)
    return [ypred, ypredproba]
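
# The FPR/TPR above come from AutoGluon's default decision threshold. A minimal sketch
# (assumption) of recomputing them at a user-chosen probability threshold:
def rates_at_threshold(ytest, ypredproba, threshold=0.5):
    ypred_thr = (ypredproba.iloc[:, 1] >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(ytest, ypred_thr).ravel()
    return fp / (fp + tn), tp / (tp + fn)  # (FPR, TPR) at this threshold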
def zerodaytest(datadf, targetcol, malinst, hostfts, modelpath):
    print("Loading model trained with dataset: D", malinst, "HostFts?: ", hostfts)
    # separate features from the target column
    featdf = datadf.iloc[:, :-1].copy()  # drop target column
    labels = datadf.iloc[:, -1].copy()
    print("Zero day testing DF (after dropping labels): ", featdf)
    print("Model: ", modelpath)
    #print(featdf.size, labels.size)
    time.sleep(5)
    print(":ZERODAY TEST RESULTS:")
    [ypred, ypredproba] = evaluate(featdf, labels, modelpath)
    return
# Invokes the AutoGluon training and testing modules with the respective dataframes
def main_ag(traindf, testdf, targetcol, malinst, hostfts):
    # Display dataframe info
    Xtest = testdf.iloc[:, :-1].copy()
    ytest = testdf.iloc[:, -1].copy()
    maltrain = traindf[traindf[targetcol] == 1].shape
    maltest = testdf[testdf[targetcol] == 1].shape
    bentrain = traindf[traindf[targetcol] == 0].shape
    bentest = testdf[testdf[targetcol] == 0].shape
    print("Train df (w/ target): ", traindf, traindf.shape)
    print("Train mal: ", maltrain, "Train ben: ", bentrain)
    print("Test df (w/ target): ", testdf, testdf.shape)
    print("Test mal: ", maltest, "Test ben: ", bentest)
    time.sleep(2)
    # Training binary classifiers: 8 base models, 2 DL models
    predictor = train_main(traindf, targetcol, malinst, hostfts)
    predstack = train_multilayerstacking(traindf, targetcol, malinst, hostfts)
    # Testing binary classifiers
    print("###################~Testing Trained Models (30% PCAPs)~############################")
    res1, fimp1, cmatrix, ytest, ypred_proba, bestmodel, perf, auc_score = test_main(Xtest, ytest, predictor, testdf, traindf)
    # Uncomment for test results with feature importance (longer run time)
    ##res1, fimp1, cmatrix, ytest, ypred_proba, bestmodel, perf, auc_score = test_main(Xtest, ytest, predictor, testdf, traindf, True)
    print("####################Stacking & Weighted Ensemble Testing###########################")
    res2, fimp2, cmatrixstacked, aucstacked = test_stack(Xtest, ytest, predstack, testdf, traindf)
    # With feature importance
    ##res2, fimp2, cmatrixstacked, aucstacked = test_stack(Xtest, ytest, predstack, testdf, traindf, True)
    return [res1, res2, fimp1, fimp2, cmatrix, bestmodel, perf, auc_score]
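
# A hypothetical driver (assumption: the file paths, column name, and flag values below
# are placeholders, not from the original project) showing how main_ag and zerodaytest
# might be invoked:
if __name__ == "__main__":
    traindf = TabularDataset("train_features.csv")  # hypothetical training CSV
    testdf = TabularDataset("test_features.csv")    # hypothetical held-out test CSV
    results = main_ag(traindf, testdf, targetcol="target", malinst=100, hostfts=True)
    # Optional: evaluate a previously trained model on unseen ("zero-day") samples
    # zerodaytest(TabularDataset("zeroday.csv"), "target", 100, True, os.getcwd() + "/AGmodels/100_True/")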