# Network Intrusion Detection using UNSW-NB15.py

# -*- coding: utf-8 -*-
"""Original DNN & XGBoost --- CNN on UNSW NB 15.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1q6xQ4N_I6sFCOEiMZMk5PotmD0tc8SJh

# DATA PreProcessing
"""

# Commented out IPython magic to ensure Python compatibility.
# data cleaning and plots
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline

# sklearn: data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# sklearn: train model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, classification_report

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
# Colab-only: mount Google Drive at /content/drive so the dataset and output
# paths used further down (all rooted at /content/drive) can be resolved.
from google.colab import drive
drive.mount('/content/drive')

# Read the two UNSW-NB15 CSV partitions from the mounted shared drive.
test_df = pd.read_csv(
    '/content/drive/Shareddrives/CyberSecurity Dataset/UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_test-set.csv'
)
train_df = pd.read_csv(
    '/content/drive/Shareddrives/CyberSecurity Dataset/UNSW-NB15 - CSV Files/a part of training and testing set/UNSW_NB15_training-set.csv'
)

# Stack both partitions (test rows first) under a fresh 0..n-1 index, then
# discard every row that contains at least one missing value.
data = pd.concat([test_df, train_df], ignore_index=True).dropna()

# Feature engineering
# Encode the attack-category strings as integer class ids. LabelEncoder numbers
# the categories in sorted order; `encoder.classes_` maps an id back to its name.
encoder = LabelEncoder()
data['attack_cat'] = encoder.fit_transform(data['attack_cat'])

# Drop the row identifier and the string-valued columns that are not encoded.
data.drop(columns=['id', 'proto', 'service', 'state'], inplace=True)

# Target: the multi-class attack category.
y = data['attack_cat']

# Features: every remaining column except the targets.
# NOTE(review): `label` is assumed to be the dataset's binary attack/normal
# flag (the second of the two trailing target columns) -- confirm against the
# CSV header. Leaving it in X would hand the models a direct function of the
# target, so it is removed together with `attack_cat`. `errors='ignore'` keeps
# this line working if the column is absent.
X = data.drop(columns=['attack_cat', 'label'], errors='ignore')

# Keep a handle on the unscaled feature frame; its column names are needed
# later to report which features the selector kept.
data1 = X

# Split data into train and test sets FIRST, so that the scaler and the
# feature selector below are fitted on training rows only. Fitting them on the
# full data set would leak test-set statistics (and test labels, via the
# F-scores) into the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: learn mean/variance on the training rows, apply to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection: keep the 20 features with the highest ANOVA F-score,
# scored on the training rows only.
selector = SelectKBest(score_func=f_classif, k=20)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Keep `X` bound to the fully transformed matrix, as before, for any code that
# still refers to it.
X = selector.transform(scaler.transform(X))

# A bare expression is only displayed inside a notebook; print so that a
# plain script run shows the shapes too.
print(data.shape)
print(data1.shape)

# get the list of selected features
selected_features_mask = selector.get_support()
selected_features = list(data1.columns[selected_features_mask])
print(selected_features)

"""# NEW Section
# DNN Model
"""

# Build DNN model: two hidden ReLU layers (64 and 32 units) on the 20 selected
# features, followed by a 10-way softmax output.
dnn_model = Sequential()
dnn_model.add(Dense(units=64, activation='relu', input_dim=20))
dnn_model.add(Dense(units=32, activation='relu'))
dnn_model.add(Dense(units=10, activation='softmax'))

# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by newer Keras releases.
# categorical_crossentropy expects one-hot targets, which is why the labels are
# one-hot encoded before fit(); sparse_categorical_crossentropy would be the
# choice if integer class ids were passed instead.
dnn_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])


# Train DNN model
# Stop when the validation loss has not improved for 5 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
dnn_model.fit(
    X_train, pd.get_dummies(y_train),
    validation_data=(X_test, pd.get_dummies(y_test)),
    epochs=50, batch_size=32, callbacks=[early_stop],
)

# Print the input shape of the DNN model.
# The model-level property is used because the per-layer `input_shape`
# attribute is not available in every Keras version.
print("Input shape of DNN model:", dnn_model.input_shape)

from sklearn.metrics import classification_report

# Class-probability matrix predicted by the DNN for the held-out rows
dnn_probs = dnn_model.predict(X_test)

# Most probable class id for each row
dnn_probs_classes = dnn_probs.argmax(axis=1)

# Ground-truth class ids for the same rows
y_true = y_test

# Per-class precision / recall / F1 summary of the DNN
report = classification_report(y_true, dnn_probs_classes)
print(report)

import pickle

# Persist the trained DNN model to Google Drive as a pickle file
dnn_pickle_path = '/content/drive/MyDrive/ML on Cyber Security Dataset/Working March 2023/dnn_model.pkl'
with open(dnn_pickle_path, 'wb') as f:
    pickle.dump(dnn_model, f)

"""# New Section
# XGBoost Model
"""

import xgboost as xgb
from sklearn.metrics import classification_report

# XGBoost classifier with the library's default hyper-parameters, fitted on
# the training split
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out rows
xgboost_y_pred = xgb_model.predict(X_test)
xgboost_probs = xgb_model.predict_proba(X_test)

# Overall accuracy, reported as a percentage
accuracy = accuracy_score(y_test, xgboost_y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Per-class precision / recall / F1 summary
report = classification_report(y_test, xgboost_y_pred)
print(report)

import pickle

# Persist the trained XGBoost model to Google Drive as a pickle file
xgb_pickle_path = '/content/drive/MyDrive/ML on Cyber Security Dataset/Working March 2023/xgb_model.pkl'
with open(xgb_pickle_path, 'wb') as f:
    pickle.dump(xgb_model, f)

"""DNN and XG boost are ensembled before
and now CNN model is moved below

# New Section
# Ensemble DNN and XGBoost
"""

# Equal-weight mean of the XGBoost and DNN class-probability matrices
ensemble_probs = 0.5 * (xgboost_probs + dnn_probs)

# Apply softmax normalization
def softmax(x, axis=1):
    """Return the softmax of ``x`` computed along ``axis``.

    Args:
        x: Array-like of scores. With the default ``axis=1`` it must be at
            least 2-D (one row per sample, one column per class).
        axis: Axis along which the values are normalised so that they sum
            to 1. Defaults to 1 (row-wise), which is how the function has
            always behaved.

    Returns:
        numpy.ndarray with the same shape as ``x``; entries are positive and
        sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum leaves the result unchanged but keeps
    # exp() from overflowing on large scores.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

# The averaged matrix is used as-is: it is the mean of two probability
# matrices, so each row is already a valid distribution. Passing it through
# softmax() would treat probabilities as logits and squash every row towards
# uniform, which changes the per-class scores that the ROC analysis thresholds.

import pickle

# Save the ensemble probability matrix to Google Drive as a pickle file
with open('/content/drive/MyDrive/ML on Cyber Security Dataset/Working March 2023/ensemble_probs.pkl', 'wb') as f:
    pickle.dump(ensemble_probs, f)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predicted class id per row: the column holding the largest ensemble probability
ensemble_labels = ensemble_probs.argmax(axis=1)

"""## Performance Matrix"""

# Score the ensemble predictions against the held-out labels
accuracy = accuracy_score(y_test, ensemble_labels)
report = classification_report(y_test, ensemble_labels)
conf_matrix = confusion_matrix(y_test, ensemble_labels)

# Report accuracy, then the per-class summary, then the confusion matrix
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from tensorflow.keras.utils import to_categorical

# One-vs-rest ROC analysis of the ensemble, all classes drawn on one figure.
# Inputs: y_test (true class ids) and ensemble_probs (per-class scores).

# Convert ensemble probabilities into class labels
ensemble_labels = np.argmax(ensemble_probs, axis=1)

# One column per scored class, so the one-hot matrix always lines up with
# ensemble_probs even if the highest class id is missing from y_test.
num_classes = ensemble_probs.shape[1]

# Convert true labels into one-hot encoded format
y_true = to_categorical(y_test, num_classes=num_classes)

# Class id -> class name, read from the fitted encoder. LabelEncoder assigns
# ids to the category names in sorted order, so a hand-written table in any
# other order would attach the wrong name to every curve.
label_names = dict(enumerate(encoder.classes_))

# Calculate the FPR, TPR and AUC for each class (thresholds are not needed)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], ensemble_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves in one graph
plt.figure()
for i in range(num_classes):
    plt.plot(fpr[i], tpr[i], label='{} (AUC = {:.2f})'.format(label_names[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
# Save BEFORE show(): once show() returns the figure may already have been
# released, in which case savefig() would write an empty image.
plt.savefig('All_roc_curve.png')  # Change the file name as desired
plt.show()
plt.close()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from tensorflow.keras.utils import to_categorical

# One-vs-rest ROC analysis of the ensemble, one figure (and one PNG) per class.
# Inputs: y_test (true class ids) and ensemble_probs (per-class scores).

# Convert ensemble probabilities into class labels
ensemble_labels = np.argmax(ensemble_probs, axis=1)

# One column per scored class, so the one-hot matrix always lines up with
# ensemble_probs even if the highest class id is missing from y_test.
num_classes = ensemble_probs.shape[1]

# Convert true labels into one-hot encoded format
y_true = to_categorical(y_test, num_classes=num_classes)

# Class id -> class name, read from the fitted encoder. LabelEncoder assigns
# ids to the category names in sorted order, so a hand-written table in any
# other order would attach the wrong name to every plot.
label_names = dict(enumerate(encoder.classes_))

# Calculate the FPR, TPR and AUC for each class and plot each curve separately
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], ensemble_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    # Plot the ROC curve for this class
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='{} (AUC = {:.2f})'.format(label_names[i], roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - {}'.format(label_names[i]))
    plt.legend(loc='lower right')
    # Save BEFORE show(): once show() returns the figure may already have been
    # released, in which case savefig() would write an empty image.
    plt.savefig('roc_curve_class{}.png'.format(i))  # Change the file name as desired
    plt.show()
    plt.close()

"""# Practice Work
# CNN Model
"""

# Build CNN model: one Conv1D layer (32 filters, width 3) over the 20 selected
# features treated as a length-20 single-channel sequence, max-pooling, then a
# 64-unit dense layer and a 10-way softmax output.
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(20, 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(units=64, activation='relu'))
cnn_model.add(Dense(units=10, activation='softmax'))
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by newer Keras releases.
cnn_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Reshape data for CNN model: append a trailing channel axis so each row
# becomes (features, 1), matching the Conv1D input_shape of (20, 1).
X_train_cnn = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Train CNN model (one-hot targets; reuses the early-stopping callback)
cnn_model.fit(
    X_train_cnn, pd.get_dummies(y_train),
    validation_data=(X_test_cnn, pd.get_dummies(y_test)),
    epochs=50, batch_size=32, callbacks=[early_stop],
)

# Print the input shape of the CNN model.
# The model-level property is used because the per-layer `input_shape`
# attribute is not available in every Keras version.
print("Input shape of CNN model:", cnn_model.input_shape)

import pickle

# Persist the trained CNN model to Google Drive as a pickle file
cnn_pickle_path = '/content/drive/MyDrive/ML on Cyber Security Dataset/Working March 2023/cnn_model.pkl'
with open(cnn_pickle_path, 'wb') as f:
    pickle.dump(cnn_model, f)

# Print model summary
# summary() writes the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after each table.
print("CNN Model Summary:")
cnn_model.summary()
print("\nDNN Model Summary:")
dnn_model.summary()

"""# OLD Section
# CNN and DNN Ensemble Model
"""

from keras.layers import Reshape

# Ensemble model (sequential prototype): a dense layer whose 64 outputs are
# reshaped to a (64, 1) sequence and passed through Conv1D / pooling before
# the classification head.
# NOTE: `ensemble_model` is rebound to the two-input functional model further
# down, so this prototype is never trained.
ensemble_model = Sequential()
ensemble_model.add(Dense(units=64, activation='relu', input_dim=20))
ensemble_model.add(Reshape((64, 1)))  # Reshape layer: Conv1D needs a channel axis
ensemble_model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
ensemble_model.add(MaxPooling1D(pool_size=2))
ensemble_model.add(Flatten())
ensemble_model.add(Dense(units=64, activation='relu'))
ensemble_model.add(Dense(units=10, activation='softmax'))
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by newer Keras releases.
ensemble_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# The section quoted below raised an error, so the replacement section was added before it.
'''
# Ensemble model
ensemble_model = Sequential()
ensemble_model.add(Dense(units=64, activation='relu', input_dim=20))
ensemble_model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(20, 1)))
ensemble_model.add(MaxPooling1D(pool_size=2))
ensemble_model.add(Flatten())
ensemble_model.add(Dense(units=64, activation='relu'))
ensemble_model.add(Dense(units=10, activation='softmax'))
ensemble_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
'''

from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, concatenate
from tensorflow.keras.models import Model

# Two-branch functional model. Branch 1 runs dense layers on the flat feature
# vector; branch 2 runs Conv1D / pooling on the same features with a trailing
# channel axis. The branch outputs are concatenated and fed to one shared
# 10-way softmax layer.
input1 = Input(shape=(20,))
dnn_layer1 = Dense(units=64, activation='relu')(input1)
dnn_layer2 = Dense(units=32, activation='relu')(dnn_layer1)

input2 = Input(shape=(20, 1))
cnn_layer1 = Conv1D(filters=32, kernel_size=3, activation='relu')(input2)
cnn_layer2 = MaxPooling1D(pool_size=2)(cnn_layer1)
cnn_layer3 = Flatten()(cnn_layer2)

merged = concatenate([dnn_layer2, cnn_layer3])
ensemble_output = Dense(units=10, activation='softmax')(merged)

ensemble_model = Model(inputs=[input1, input2], outputs=ensemble_output)
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by newer Keras releases.
ensemble_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Validation data is passed explicitly, so `validation_split` is not given as
# well: Keras ignores the split whenever `validation_data` is supplied, and
# passing both only obscures which rows are used for validation.
ensemble_model.fit(
    [X_train, X_train_cnn], pd.get_dummies(y_train),
    validation_data=([X_test, X_test_cnn], pd.get_dummies(y_test)),
    epochs=50, batch_size=32,
)

import pickle

# Persist the trained DNN + CNN ensemble model to Google Drive as a pickle file
ensemble_pickle_path = '/content/drive/MyDrive/ML on Cyber Security Dataset/Working March 2023/ensemble_model.pkl'
with open(ensemble_pickle_path, 'wb') as f:
    pickle.dump(ensemble_model, f)

"""
# Performance Matrix"""

# Evaluate the model on test data (loss and accuracy against one-hot targets)
ensemble_loss, ensemble_acc = ensemble_model.evaluate([X_test, X_test_cnn], pd.get_dummies(y_test))

# Get predictions for test data
y_pred = ensemble_model.predict([X_test, X_test_cnn])

# Convert predictions to classes
y_pred_classes = np.argmax(y_pred, axis=1)

# Get true classes
y_true = y_test

# Calculate confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred_classes)

# Print metrics
print("Test loss:", ensemble_loss)
print("Test accuracy:", ensemble_acc)
print("Confusion matrix:")
print(cm)

# Class names in class-id order, read from the fitted encoder. LabelEncoder
# numbers the categories in sorted order, so a hand-written list in any other
# order would put the wrong name on every row and column of the heatmap.
class_labels = list(encoder.classes_)

# A single figure for the heatmap (no extra empty figure is created)
plt.figure(figsize=(10, 10))

# Visualize the confusion matrix using a heatmap
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix for Ensemble DNN & CNN')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

# Loss and accuracy of the two-input ensemble on the held-out rows
ensemble_loss, ensemble_acc = ensemble_model.evaluate([X_test, X_test_cnn], pd.get_dummies(y_test))

# Hard class predictions: arg-max over the predicted class probabilities
y_pred = ensemble_model.predict([X_test, X_test_cnn])
y_pred = y_pred.argmax(axis=1)

# Accuracy (as a percentage), confusion matrix and per-class report
print("Accuracy of the Ensemble Model: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("Confusion Matrix of the Ensemble Model:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report of the Ensemble Model:")
print(classification_report(y_test, y_pred))

from sklearn.metrics import classification_report

# Class-probability matrix from the ensemble for the held-out rows
y_pred = ensemble_model.predict([X_test, X_test_cnn])

# Most probable class id per row
y_pred_classes = y_pred.argmax(axis=1)

# Ground-truth class ids
y_true = y_test

# Per-class precision / recall / F1 summary of the ensemble
report = classification_report(y_true, y_pred_classes)
print(report)

# Percentage heatmap of the ensemble confusion matrix `cm`.

# Class names in class-id order, read from the fitted encoder. LabelEncoder
# numbers the categories in sorted order, so a hand-written list in any other
# order would mislabel the axes.
class_labels = list(encoder.classes_)

# Total number of evaluated samples. It is computed here from `cm` itself
# because no `total_samples` has been assigned yet at this point in the script.
total_samples = cm.sum()

plt.figure(figsize=(20, 20))

# Visualize the confusion matrix using a heatmap (cells as % of all samples)
sns.heatmap(cm/total_samples*100, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix for Ensamble')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

from sklearn.metrics import confusion_matrix

# Predict on test set using CNN model.
# The CNN was built and trained on 3-D input (samples, 20, 1), so it must be
# given the reshaped `X_test_cnn`; the 2-D `X_test` does not match its input.
y_pred_cnn = cnn_model.predict(X_test_cnn)
y_pred_cnn = np.argmax(y_pred_cnn, axis=1)

# Confusion matrix of the CNN (rows: true class, columns: predicted class)
cm_cnn = confusion_matrix(y_test, y_pred_cnn)

# Nothing is plotted in this cell, so no figure is opened here.
print("CNN Confusion Matrix:")
print(cm_cnn)

# Predict on test set using DNN model
y_pred_dnn = dnn_model.predict(X_test)
y_pred_dnn = np.argmax(y_pred_dnn, axis=1)

# Confusion matrix of the DNN (rows: true class, columns: predicted class)
cm_dnn = confusion_matrix(y_test, y_pred_dnn)

# Nothing is plotted in this cell, so no figure is opened here.
print("DNN Confusion Matrix:")
print(cm_dnn)

# Percentage heatmap of the CNN confusion matrix `cm_cnn`.

# Class names in class-id order, read from the fitted encoder. LabelEncoder
# numbers the categories in sorted order, so a hand-written list in any other
# order would mislabel the axes.
class_labels = list(encoder.classes_)

# Total number of evaluated samples (sum over every cell of the matrix)
total_samples = cm_cnn.sum()

plt.figure(figsize=(20, 20))

# Visualize the confusion matrix using a heatmap (cells as % of all samples)
sns.heatmap(cm_cnn/total_samples*100, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix for CNN')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()


# Percentage heatmap of the DNN confusion matrix `cm_dnn`.

# Class names in class-id order, read from the fitted encoder. LabelEncoder
# numbers the categories in sorted order, so a hand-written list in any other
# order would mislabel the axes.
class_labels = list(encoder.classes_)

# Total number of evaluated samples (sum over every cell of the matrix)
total_samples = cm_dnn.sum()

# Same figure size as the CNN heatmap so the two plots are comparable
plt.figure(figsize=(20, 20))

# Visualize the confusion matrix using a heatmap (cells as % of all samples)
sns.heatmap(cm_dnn/total_samples*100, annot=True, cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix for DNN')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

# Full metric summary for the two-input ensemble model on the held-out rows.

# Class probabilities from the ensemble's softmax output. A functional Keras
# Model has no predict_proba(); predict() already returns the probabilities.
y_prob = ensemble_model.predict([X_test, X_test_cnn])

# convert the predictions to class labels
y_pred = np.argmax(y_prob, axis=1)

# True class ids. y_test already holds integer ids, so it is used directly; a
# one-hot round trip would renumber the classes if one were missing from y_test.
y_true = np.asarray(y_test)

# calculate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# calculate the classification report
cr = classification_report(y_true, y_pred)
print("Classification Report:\n", cr)

# calculate the area under the ROC curve.
# roc_curve() only accepts binary targets, so the multi-class AUC is computed
# with roc_auc_score in one-vs-rest mode (macro average over the classes).
roc_auc = roc_auc_score(y_true, y_prob, multi_class='ovr')
print("Area Under the ROC Curve:", roc_auc)

# calculate the recall, precision, and F1 score (unweighted mean over classes)
recall = recall_score(y_true, y_pred, average='macro')
precision = precision_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# calculate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

"""# New Section
# Naive Bayes
"""

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Assuming you have preprocessed and split your dataset into X_train, X_test, y_train, y_test

# Gaussian Naive Bayes baseline: fit on the training split, then predict the
# held-out rows
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test)

# Per-class precision / recall / F1 summary
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

from sklearn.metrics import accuracy_score

# Overall fraction of correctly classified rows
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


"""# New Section
# Support Vector Machine (SVM)
"""

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from joblib import parallel_backend

# Search space: three values of the SVC regularisation parameter C crossed
# with the linear and RBF kernels
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}

# Estimator under search: standardise the features, then fit an SVC
pipeline = make_pipeline(StandardScaler(), SVC())

# 5-fold cross-validated grid search, run under joblib's multiprocessing backend
with parallel_backend('multiprocessing'):
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

# Winning hyper-parameters and the pipeline refitted with them
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Score the refitted pipeline on the held-out rows
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

"""# New Section
# Naive Bayes
"""


from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Gaussian Naive Bayes on the prepared split (X_train, X_test, y_train, y_test)

# Instantiate a Gaussian Naive Bayes model
naive_bayes = GaussianNB()

# Train the model
naive_bayes.fit(X_train, y_train)

# Make predictions on the test data
y_pred = naive_bayes.predict(X_test)

# Print the classification report once, under its heading (it was previously
# emitted twice in a row)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


"""# New Section"""