# main.py

# Copyright 2017 Abien Fred Agarap
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================

"""Main program implementing the deep learning algorithms"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__version__ = '0.1.0'
__author__ = 'Abien Fred Agarap'

import argparse
from models.cnn_svm import CNN
from models.gru_svm import GruSvm
from models.mlp_svm import MLP
import numpy as np
from sklearn.model_selection import train_test_split
from utils.data import load_data
from utils.data import one_hot_encode

# Hyperparameters shared by the models instantiated in main().

# Mini-batch size handed to every model; main() also truncates the train and
# test sets to a multiple of this value.
BATCH_SIZE = 256
# Passed as `cell_size` to GruSvm.
CELL_SIZE = 256
# Passed as `dropout_rate` to GruSvm.
# NOTE(review): whether this is a keep probability or a drop probability
# depends on GruSvm's implementation -- confirm there.
DROPOUT_RATE = 0.85
# Passed as `alpha` to all three models.
LEARNING_RATE = 1e-3
# Passed as `node_size` to MLP (presumably the hidden layer widths -- verify
# against models.mlp_svm).
NODE_SIZE = [512, 256, 128]
# Passed as `num_layers` to GruSvm.
NUM_LAYERS = 5


def parse_args(argv=None):
    """Parse the command-line options of the training script.

    :param argv: optional list of argument strings to parse. When None (the
        default) argparse falls back to ``sys.argv[1:]``, which is the
        behavior of the original zero-argument call.
    :return: an ``argparse.Namespace`` with the attributes ``model``,
        ``dataset``, ``num_epochs``, ``penalty_parameter``,
        ``checkpoint_path``, ``log_path`` and ``result_path``.
    :raises SystemExit: raised by argparse when a required option is missing,
        a value cannot be converted, or the model id is not 1, 2 or 3.
    """
    parser = argparse.ArgumentParser(
        description='Deep Learning Using Support Vector Machine for Malware Classification')
    group = parser.add_argument_group('Arguments')
    # `choices` rejects an unknown model id at parse time with a usage message,
    # instead of letting it travel further into the program.
    group.add_argument('-m', '--model', required=True, type=int, choices=(1, 2, 3),
                       help='[1] CNN-SVM, [2] GRU-SVM, [3] MLP-SVM')
    group.add_argument('-d', '--dataset', required=True, type=str,
                       help='the dataset to be used')
    group.add_argument('-n', '--num_epochs', required=True, type=int,
                       help='number of epochs')
    group.add_argument('-c', '--penalty_parameter', required=True, type=float,
                       help='the SVM C penalty parameter')
    group.add_argument('-k', '--checkpoint_path', required=True, type=str,
                       help='path where to save the trained model')
    group.add_argument('-l', '--log_path', required=True, type=str,
                       help='path where to save the TensorBoard logs')
    group.add_argument('-r', '--result_path', required=True, type=str,
                       help='path where to save actual and predicted labels array')
    return parser.parse_args(argv)


def _trim_to_batch_multiple(features, labels, batch_size):
    """Drop trailing samples so the sample count is a multiple of batch_size."""
    usable = int(features.shape[0])
    usable -= usable % batch_size
    return features[:usable], labels[:usable]


def main(arguments):
    """Load the dataset, split it, and train the selected model.

    The dataset file named by ``arguments.dataset`` is loaded with ``np.load``,
    converted to features/labels by ``load_data``, one-hot encoded, split
    70/30 (stratified on the labels) and truncated so that both partitions
    hold a whole number of ``BATCH_SIZE`` batches. The model selected by
    ``arguments.model`` is then built and its ``train`` method is called.

    :param arguments: namespace as returned by ``parse_args`` (uses ``model``,
        ``dataset``, ``num_epochs``, ``penalty_parameter``,
        ``checkpoint_path``, ``log_path`` and ``result_path``).
    :raises ValueError: if ``arguments.model`` is not 1, 2 or 3, or if the
        GRU-SVM model is selected and the number of features is not a
        perfect square.
    """
    model_choice = arguments.model
    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let an invalid choice fall through silently.
    if model_choice not in (1, 2, 3):
        raise ValueError('Invalid choice: Choose among 1, 2, and 3 only.')

    dataset = np.load(arguments.dataset)

    features, labels = load_data(dataset=dataset)

    labels = one_hot_encode(labels=labels)

    # get the number of features
    num_features = features.shape[1]

    # get the number of classes
    num_classes = labels.shape[1]

    # split the dataset by 70/30
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.30,
                                                                                stratify=labels)

    # keep only whole batches in each partition
    # (presumably the models assume a fixed batch size -- confirm in models.*)
    train_features, train_labels = _trim_to_batch_multiple(train_features, train_labels, BATCH_SIZE)
    test_features, test_labels = _trim_to_batch_multiple(test_features, test_labels, BATCH_SIZE)

    if model_choice == 1:
        model = CNN(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, num_classes=num_classes,
                    penalty_parameter=arguments.penalty_parameter, sequence_length=num_features)
        model.train(checkpoint_path=arguments.checkpoint_path, log_path=arguments.log_path,
                    result_path=arguments.result_path, epochs=arguments.num_epochs,
                    train_data=[train_features, train_labels], train_size=int(train_features.shape[0]),
                    test_data=[test_features, test_labels], test_size=int(test_features.shape[0]))
    elif model_choice == 2:
        # The GRU consumes each sample as a square (side x side) sequence, so
        # the flat feature vector must have a perfect-square length.
        side = int(np.sqrt(num_features))
        if side * side != num_features:
            raise ValueError('GRU-SVM requires a perfect-square number of features, got {}.'.format(num_features))
        train_features = np.reshape(train_features, (train_features.shape[0], side, side))
        test_features = np.reshape(test_features, (test_features.shape[0], side, side))
        model = GruSvm(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, cell_size=CELL_SIZE, dropout_rate=DROPOUT_RATE,
                       num_classes=num_classes, num_layers=NUM_LAYERS, sequence_height=train_features.shape[2],
                       sequence_width=train_features.shape[1], svm_c=arguments.penalty_parameter)
        model.train(checkpoint_path=arguments.checkpoint_path, log_path=arguments.log_path, epochs=arguments.num_epochs,
                    train_data=[train_features, train_labels], train_size=int(train_features.shape[0]),
                    test_data=[test_features, test_labels], test_size=int(test_features.shape[0]),
                    result_path=arguments.result_path)
    elif model_choice == 3:
        model = MLP(alpha=LEARNING_RATE, batch_size=BATCH_SIZE, node_size=NODE_SIZE, num_classes=num_classes,
                    num_features=num_features, penalty_parameter=arguments.penalty_parameter)
        model.train(checkpoint_path=arguments.checkpoint_path, num_epochs=arguments.num_epochs,
                    log_path=arguments.log_path, train_data=[train_features, train_labels],
                    train_size=int(train_features.shape[0]), test_data=[test_features, test_labels],
                    test_size=int(test_features.shape[0]), result_path=arguments.result_path)


if __name__ == '__main__':
    # Script entry point: parse the command-line options and hand them to main().
    main(parse_args())