I'm using the UNSW-NB15 dataset, which is heavily imbalanced, to train a multi-class classification MLP. I encode the categorical features of the dataset, which results in a dataframe of 2+ million rows by 205 columns. After creating a sequential model with several layers, I run the code and the loss value for each epoch is far too high. I have searched for ways to balance my dataset and hopefully solve that issue, but none have worked. Here is the full code, including the preprocessing of the dataset (a class-weight sketch I have been considering is included after the code). I hope someone can advise me on how to balance the dataset for better results:
import pandas as pd
from tensorflow.keras.utils import get_file
dfs = []
for i in range(1, 5):
    path = './UNSW-NB15_{}.csv'  # There are 4 input csv files
    dfs.append(pd.read_csv(path.format(i), dtype={'attack_cat': str, 'ct_ftp_cmd': int}, header=None))
all_data = pd.concat(dfs).reset_index(drop=True) # Concat all to a single df
# This csv file contains names of all the features
df_col = pd.read_csv('./NUSW-NB15_features.csv', encoding='ISO-8859-1')
# Making column names lower case, removing spaces
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())
# Renaming our dataframe with proper column names
all_data.columns = df_col['Name']
# adjust display options and inspect the dataframe
pd.set_option('display.max_columns', 48)
pd.set_option('display.max_rows', 21)
all_data
all_data['attack_cat'] = all_data['attack_cat'].str.strip()
all_data['attack_cat'] = all_data['attack_cat'].replace(['Backdoors'], 'Backdoor')
all_data.groupby('attack_cat')['attack_cat'].count()
all_data["attack_cat"] = all_data["attack_cat"].fillna('Normal')
all_data.groupby('attack_cat')['attack_cat'].count()
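# To quantify the imbalance, the same counts can also be viewed as class
# proportions (an illustrative check, not part of the original pipeline):
all_data['attack_cat'].value_counts(normalize=True)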
all_data.drop(all_data[all_data['is_ftp_login'] >= 2.0].index, inplace = True)
all_data.drop(['srcip', 'sport', 'dstip', 'dsport', 'ct_ftp_cmd'],axis=1, inplace=True)
df = pd.concat([all_data,pd.get_dummies(all_data['proto'],prefix='proto')],axis=1)
df.drop('proto',axis=1, inplace=True)
df_2 = pd.concat([df,pd.get_dummies(df['state'],prefix='state')],axis=1)
df_2.drop('state',axis=1, inplace=True)
df_encoded = pd.concat([df_2,pd.get_dummies(df_2['service'],prefix='service')],axis=1)
df_encoded.drop('service',axis=1, inplace=True)
df_encoded['ct_flw_http_mthd'] = df_encoded['ct_flw_http_mthd'].fillna(0)
df_encoded['is_ftp_login'] = df_encoded['is_ftp_login'].fillna(0)
# keep working on the fully encoded frame; drop the binary label and use attack_cat as target
df_encoded = df_encoded.drop('label', axis=1)
x_columns = df_encoded.columns.drop('attack_cat')
x = df_encoded[x_columns].values
dummies = pd.get_dummies(df_encoded['attack_cat'])
products = dummies.columns
y = dummies.values
import numpy as np
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics
from sklearn.utils import class_weight
from sklearn.metrics import precision_score , recall_score
x_train, x_test, y_train, y_test = train_test_split(
x, y, test_size=0.25, random_state=42)
x_train = tf.cast(x_train, dtype=tf.float32)
x_test = tf.cast(x_test, dtype=tf.float32)
model = Sequential()
model.add(Dense(203, input_dim= x.shape[1], activation= 'relu'))
model.add(Dense(407, activation= 'relu'))
model.add(Dense(407,activation= 'relu'))
model.add(Dense(y_train.shape[1],activation= 'softmax', kernel_initializer='normal'))
model.compile(loss= 'categorical_crossentropy', optimizer= 'adam', metrics= ['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5,
verbose=1, mode='auto', restore_best_weights=True)
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          callbacks=[monitor], verbose=2, batch_size=10000, epochs=1000)
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_compare, pred)
print("Accuracy score: {}".format(score))