In: Computer Science
Write a neural network in Python for multiclass classification of an imbalanced dataset: build a model, train and evaluate it, and print the Accuracy, Precision, Recall, and F1 score. Data will be in the form of a CSV file with 600,000 samples (rows in the CSV), 15 classes (for y_train and y_test), and 78 input dimensions/features. The imbalance is severe: some classes have as few as 8 samples. Your job is to make a neural network that handles this imbalance and to optimize the network for the best Accuracy, Precision, Recall, and F1 score across all 15 classes.
Sample data -
22 | 6 | 386359 | 22 | 20 | 1912 | 2665 | 640 | 0 | 86.90909 | 137.688 | 976 | 0 | 133.25 | 268.7713 | 11846.5 | 108.7072 | 9423.39 | 22717.21 | 122019 | 5 | 385442 | 18354.38 | 29591.49 | 122019 | 212 | 386353 | 20334.37 | 43298.97 | 161209 | 10 | 0 | 0 | 0 | 0 | 712 | 648 | 56.94186 | 51.76533 | 0 | 976 | 106.4419 | 207.2919 | 42969.92 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 108.9762 | 86.90909 | 133.25 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | 1912 | 20 | 2665 | 26883 | 230 | 16 | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class 4 |
0 | 0 | 1.13E+08 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.026634 | 56319965 | 33.23402 | 56319988 | 56319941 | 1.13E+08 | 56319965 | 33.23402 | 56319988 | 56319941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.026634 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | -1 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 56319965 | 33.23402 | 56319988 | 56319941 | Class 7 |
#Importing numpy to perform linear algebraic operations on the data
import numpy as np
#Import pandas library to perform the data preprocessing
import pandas as pd
#Importing the Keras deep learning framework for Python
import keras
#Importing the Sequential model from keras
from keras.models import Sequential
#Importing the types of layers in the Neural Network that we are going to have
from keras.layers import Dense
#Importing the train_test_split function which is useful in dividing the dataset into the training and testing data
from sklearn.model_selection import train_test_split
#Importing StandardScaler to standardise/scale the features and LabelEncoder to turn the string class labels into integers
from sklearn.preprocessing import StandardScaler, LabelEncoder
#Importing the metrics for the performance evaluation of our deep learning model
from sklearn import metrics
def data_preprocessing(data):
    #With 78 input features, the column at index 78 (the 79th column) holds the class labels,
    #because indexing starts from 0 in Pandas
    X = data.iloc[:, 0:78]
    y = data.iloc[:, 78]
    #The labels in the sample data are strings such as "Class 4", so encode them as integers 0-14
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    #Splitting the dataset into an 80:20 ratio between train and test; you can try other ratios as well
    #stratify=y keeps the class proportions the same in both splits, which matters when some classes have only 8 samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 23, stratify = y)
    #Creating an object of StandardScaler
    sc = StandardScaler()
    #Scaling the data using the StandardScaler() object
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test
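#The question asks to fix the class imbalance, which the model below does not do on its own.
#One common remedy (a sketch, not part of the original answer) is to weight the loss by inverse class
#frequency using sklearn's compute_class_weight and pass the result to model.fit via its class_weight
#argument, e.g. neural_classifier.fit(X_train, y_train, ..., class_weight = get_class_weights(y_train)).
#The helper name get_class_weights below is my own; adapt it as needed.
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights(y_train):
    #'balanced' gives each class a weight inversely proportional to its frequency,
    #so the classes with only 8 samples contribute much more per sample to the loss
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight = 'balanced', classes = classes, y = y_train)
    return {int(c): w for c, w in zip(classes, weights)}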
def train_and_evaluate(X_train, X_test, y_train, y_test):
    neural_classifier = Sequential()
    #units = number of neurons in the layer
    #kernel_initializer = how the weights of the layer are initialised
    #input_dim = number of neurons in the input layer = number of input features = 78 in your case
    #activation = activation function used in each layer
    #Dense is the type of layer
    #First hidden layer (it also defines the 78-feature input layer via input_dim)
    neural_classifier.add(Dense(units = 100, kernel_initializer = 'uniform', activation = 'relu', input_dim = 78))
    #Second hidden layer
    neural_classifier.add(Dense(units = 150, kernel_initializer = 'uniform', activation = 'relu'))
    #Third hidden layer
    neural_classifier.add(Dense(units = 200, kernel_initializer = 'uniform', activation = 'relu'))
    #Fourth hidden layer
    neural_classifier.add(Dense(units = 250, kernel_initializer = 'uniform', activation = 'relu'))
    #Fifth hidden layer
    neural_classifier.add(Dense(units = 300, kernel_initializer = 'uniform', activation = 'relu'))
    #Sixth hidden layer
    neural_classifier.add(Dense(units = 350, kernel_initializer = 'uniform', activation = 'relu'))
    #Seventh hidden layer
    neural_classifier.add(Dense(units = 400, kernel_initializer = 'uniform', activation = 'relu'))
    #Eighth hidden layer
    neural_classifier.add(Dense(units = 250, kernel_initializer = 'uniform', activation = 'relu'))
    #Ninth hidden layer
    neural_classifier.add(Dense(units = 300, kernel_initializer = 'uniform', activation = 'relu'))
    #Output layer
    #The output layer has 15 neurons because there are 15 classes in your dataset
    #Since it is a multiclass classification problem we use the softmax activation function
    #If it were a binary classification problem we could have used the sigmoid activation function
    neural_classifier.add(Dense(units = 15, kernel_initializer = 'uniform', activation = 'softmax'))
    #Adam is an optimization algorithm that minimises the loss at every epoch
    #sparse_categorical_crossentropy is used because y_train holds integer class labels (0-14) rather than one-hot vectors
    neural_classifier.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    #epochs = number of complete passes over the training data (a starting point; tune as needed)
    #With 600,000 rows, a larger batch size such as 256 keeps the number of updates per epoch manageable
    neural_classifier.fit(X_train, y_train, batch_size = 256, epochs = 100)
    #Predicting class probabilities for the test data and taking the argmax to get the predicted class labels
    y_prob = neural_classifier.predict(X_test)
    y_pred = np.argmax(y_prob, axis = 1)
    #Calculating the accuracy score
    accuracy = metrics.accuracy_score(y_test, y_pred)
    #Calculating the precision score (weighted average over the 15 classes)
    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    #Calculating the recall score
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    #Calculating the f1-score
    f1score = metrics.f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1score
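#The question also asks for Precision, Recall and F1 for all 15 classes, and the weighted averages above
#can hide poor performance on the rare classes. A per-class breakdown (a sketch using sklearn's
#classification_report; the helper name print_per_class_report is my own) can be printed as well:
def print_per_class_report(y_test, y_pred):
    #y_pred here are the integer class predictions (the argmax of the softmax output)
    print(metrics.classification_report(y_test, y_pred, digits = 4))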
def main():
    data = pd.read_csv('Filename.csv')
    X_train, X_test, y_train, y_test = data_preprocessing(data)
    accuracy, precision, recall, f1_score = train_and_evaluate(X_train, X_test, y_train, y_test)
    print("Accuracy score of the model is :", accuracy)
    print("Precision score of the model is :", precision)
    print("Recall score of the model is :", recall)
    print("F1-score of the model is :", f1_score)

main()
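If weighting the loss is not enough, resampling is another option. The sketch below is an assumption on my part (it uses the imbalanced-learn package, which the answer above does not), and it would go inside train_and_evaluate, right before the call to fit, so that only the training split is oversampled and the test set stays untouched:

#Oversampling the rare classes in the training data only (requires: pip install imbalanced-learn)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state = 23)
X_train, y_train = ros.fit_resample(X_train, y_train)
#The existing neural_classifier.fit(X_train, y_train, ...) call can then be used unchanged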