In: Computer Science
Data Mining
Please perform Principal Component Analysis and K-Means Clustering on the given dataset below. [50 Points]
Dataset Link : https://dataminingcsc6740.s3-us-west-2.amazonaws.com/datasets/homework_2.csv
10 Points for Data Preprocessing.
15 Points for the PCA algorithm along with plots and an explanation of the results.
15 Points for the K-Means algorithm with plots and an explanation of the results.
10 Points for comparing the results between PCA and K-Means and stating what your inference is from the outputs of the algorithms.
Hints:
As part of the data preprocessing step, convert all the variables in the dataset into numerical values, since the algorithms only work with numerical values.
Then apply both algorithms one after the other, plot the output clusters, and compare the output clusters from both steps.
For this data mining assignment, I used the R language platform as the development environment.
The code below performs k-means clustering and principal component analysis (PCA).
#load dataset
library(ggplot2)
library(ggthemes)
library(GGally)
library(dplyr)
library(Metrics)
# Load the dataset (semicolon-delimited CSV).
# NOTE(review): hard-coded absolute Windows path — prefer a relative
# path or file.choose() so the script runs on other machines.
homework_2 <- read.csv("C:/Users/CST2019/Downloads/homework_2.csv", sep = ";")

# Structure of the raw dataset (column names, types, example values)
str(homework_2)

# FIX: require() is for optional dependencies (it returns FALSE instead
# of erroring when the package is missing); use library() for loading.
# dplyr is already attached at the top of the file, so this is a no-op
# kept for parity with the original script flow.
library(dplyr)
# Pre-processing: encode every categorical column as numeric level codes.
#
# BUG FIX: read.csv() returns *character* columns by default (R >= 4.0),
# and as.numeric() on a character vector such as c("yes", "no") produces
# NA for every value (with a coercion warning). Routing each column
# through factor() first assigns an integer code per level, which is the
# numeric encoding the clustering/PCA steps need.
#
# The ten copy-pasted conversion/str() pairs are collapsed into one loop.
cat_cols <- c("default", "housing", "loan", "poutcome", "y",
              "month", "contact", "job", "marital", "education")
for (col in cat_cols) {
  homework_2[[col]] <- as.numeric(as.factor(homework_2[[col]]))
}
str(homework_2)

# Overall summary after the conversion (all columns now numeric)
summary(homework_2)
str(homework_2)
# Exploratory plots: distributions of selected variables.
# (After encoding, marital/education/default/housing/loan hold integer
# level codes, so their histograms show level frequencies rather than a
# continuous distribution.)
# The seven copy-pasted hist() calls are collapsed into one loop with
# informative titles and axis labels.
plot_cols <- c("age", "marital", "education", "default",
               "balance", "housing", "loan")
for (col in plot_cols) {
  hist(homework_2[[col]],
       main = paste("Histogram of", col),
       xlab = col)
}
# K-means clustering ----
# FIX: the original called kmeans() without storing the result, so the
# cluster assignments could never be plotted or compared against PCA as
# the assignment requires. Additional fixes:
#   * set.seed() makes the random centroid initialization reproducible;
#   * scale() stops wide-range columns (e.g. balance, duration) from
#     dominating the Euclidean distance;
#   * nstart = 25 restarts guard against a poor local optimum.
set.seed(42)
km_fit <- kmeans(scale(homework_2), centers = 3, nstart = 25)
km_fit$size     # rows per cluster
km_fit$cluster  # cluster assignment for each row

# Correlation matrix ----
# Renamed the result from `cor` to `cor_mat` so it no longer shadows
# the base cor() function.
cor_mat <- cor(homework_2)
library(corrplot)
corrplot(cor_mat, method = "number")
# Train/test split (70/30) ----
library(caTools)
# FIX: sample.split() expects the outcome *vector* (here homework_2$y),
# not the whole data frame. Passing a data frame yields a logical vector
# of length ncol() that subset() silently recycles across rows, giving a
# meaningless split. set.seed() makes the split reproducible.
set.seed(42)
split_mask <- sample.split(homework_2$y, SplitRatio = 0.70)
train1 <- subset(homework_2, split_mask == TRUE)   # 70% of rows for training
test1  <- subset(homework_2, split_mask == FALSE)  # remaining 30% for testing
summary(train1)
summary(test1)
# PCA algorithm ----
# scale. = TRUE standardizes each column before the decomposition, which
# is required here because the variables are on very different scales.
# NOTE(review): train1 still contains pdays, the response modeled later —
# consider excluding it from the PCA inputs to avoid leakage.
prin_comp <- prcomp(train1, scale. = TRUE)
names(prin_comp)
prin_comp$center    # column means used for centering
prin_comp$scale     # column standard deviations used for scaling
prin_comp$rotation  # loadings: one column per principal component
biplot(prin_comp, scale = 0)

# Variance explained by each component
std_dev <- prin_comp$sdev
std_dev
pr_var <- std_dev^2
pr_var
pr_var[1:10]  # variance of the first 10 components

# Proportion of variance explained
prop_varex <- pr_var / sum(pr_var)
# FIX: index only as many components as actually exist — the original
# 1:20 pads the printout with NA when the data has fewer than 20 columns.
prop_varex[seq_len(min(20, length(prop_varex)))]

# Scree and cumulative-variance plots (the assignment asks for plots;
# these justify how many components to keep)
plot(prop_varex, type = "b",
     xlab = "Principal Component",
     ylab = "Proportion of Variance Explained")
plot(cumsum(prop_varex), type = "b",
     xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained")
# Regression on principal components ----
# Training frame: the response (pdays) followed by the PC scores.
train.data <- data.frame(pdays = train1$pdays, prin_comp$x)
# FIX (off-by-one): column 1 is pdays, so keeping the first 10 principal
# components requires 11 columns. The original [, 1:10] silently kept
# only 9 components, mismatching the 10 selected for the test set below.
train.data <- train.data[, 1:11]

# Decision tree (regression) on the component scores.
# FIX: do not call install.packages() unconditionally on every run;
# install only when the package is actually missing.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)
rpart.model <- rpart(pdays ~ ., data = train.data, method = "anova")
rpart.model
# Score the held-out set ----
# Project the test rows onto the principal components fitted on the
# training set, keep the first 10 component scores, and run the decision
# tree over them.
pc_scores <- predict(prin_comp, newdata = test1)
test.data <- as.data.frame(pc_scores)[, 1:10]
rpart.prediction <- predict(rpart.model, test.data)