In: Advanced Math
Write your own R-function for the linear regression using qr() function. You must not use solve() function. The input of your function would be a design matrix X and a response vector y and the output must contain
- least sqaure estimates and the corresponding stnadard errors.
- residuals and MSE
- fitted values
- ANOVA table
##Importing Data set
library(readxl)
project_data <-
read.table("https://www.kaggle.com/kumarajarshi/life-expectancy-who.txt",header=T)
View(project_data)
attach(project_data)
##assign column names
y<- project_data$`Life expectancy`
x1<-project_data$Status
x2<-project_data$`Adult Mortality`
x3<-project_data$`infant deaths`
x4<-project_data$`Hepatitis B`
x5<-project_data$Measles
x6<-project_data$BMI
x7<-project_data$`under-five deaths`
x8<-project_data$Polio
x9<-project_data$Diphtheria
x10<-project_data$`HIV/AIDS`
x11<-project_data$GDP
x12<-project_data$Population
x13<-project_data$`thinness 1-19 years`
x14<-project_data$`thinness 5-9 years`
x15<-project_data$`Income composition of resources`
x16<-project_data$Schooling
##Plotting boxplot and density
for(c in project_data[,2:17])
{
boxplot(c)
plot(density(c))
polygon(density(c),col = "green", border = "red")
}
##Skewness of data
library(e1071)
for(c in project_data[,2:17])
{
print(skewness(c))
}
##Transformation for removing skewness
x3.1<- x3^(1/6)
x5.1<- x5^(1/5)
x7.1<- x7^(1/4)
x12.1<- log10(x12)
##Print Skewness values for transformed variables
print(skewness(x3.1))
print(skewness(x5.1))
print(skewness(x7.1))
print(skewness(x12.1))
##Plotting boxplot and density for transformed variables
boxplot(x3.1)
plot(density(x3.1))
polygon(density(x3.1), col = "green", border = "red")
boxplot(x5.1)
plot(density(x5.1))
polygon(density(x5.1), col = "green", border = "red")
boxplot(x7.1)
plot(density(x7.1))
polygon(density(x7.1), col = "green", border = "red")
boxplot(x12.1)
plot(density(x12.1))
polygon(density(x12.1), col = "green", border = "red")
View(project_data)
##Import project_data again with transformed variables
library(readxl)
project_data <- read_excel("PROJECT/project_data.xlsx", sheet =
"Sheet3")
View(project_data)
##assign column names once again with transformed
variables
y<- project_data$`Life expectancy`
x1<-project_data$Status
x2<-project_data$`Adult Mortality`
x3<-project_data$`infant deaths`
x4<-project_data$`Hepatitis B`
x5<-project_data$Measles
x6<-project_data$BMI
x7<-project_data$`under-five deaths`
x8<-project_data$Polio
x9<-project_data$Diphtheria
x10<-project_data$`HIV/AIDS`
x11<-project_data$GDP
x12<-project_data$Population
x13<-project_data$`thinness 1-19 years`
x14<-project_data$`thinness 5-9 years`
x15<-project_data$`Income composition of resources`
x16<-project_data$Schooling
## fitting model
fitt<-lm(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15+x16)
## forming x matrix
x <-
cbind(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
frame <-
data.frame(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
## installing corrplot package for dispersion matrix
installed.packages("corrplot")
library(corrplot)
## checking correlation between variables and response
variable
corrplot(cor(cbind(y,frame)),method="number")
## installing car package for VIF
install.packages("car")
library(car)
## checking VIF
vif(fitt)
summary(fitt)
## we observe that adj R sq= 0.885 and Multiple R-squared:
0.8996
## installing package for variance decomposition
install.packages("perturb")
library(perturb)
## checking variance decomposition
colldiag(fitt)
##??16,4(=0.906), ??16,9(=0.933) are large
##the regressor variable with higher VIF value (i.e. x9 ) is
removed and fit model excluding x9
fitt1<-lm(y~x1+x2+x3+x4+x5+x6+x7+x8+x10+x11+x12+x13+x14+x15+x16)
summary(fitt1)
## we observe that adj R sq= 0.8854 and Multiple R-squared:
0.899
## stage 2 checking vif
vif(fitt1)
## Checking vdm at stage 3
colldiag(fitt1)
##??15,3(=0.970), ??15,7(=0.960) are large
##the regressor variable with higher VIF value (i.e. x3 ) is
removed and fit model excluding x9
fitt2<-lm(y~x1+x2+x4+x5+x6+x7+x8+x10+x11+x12+x13+x14+x15+x16)
summary(fitt2)
## we observe that adj R sq= 0.8847 and Multiple R-squared:
0.8975
## stage 2 checking vif
vif(fitt2)
## Checking vdm at stage 3
colldiag(fitt2)
##***??14,(), ??14,() are large
##the regressor variable with higher VIF value (i.e. x13 ) is
removed and fit model excluding x9
fitt3<-lm(y~x1+x2+x4+x5+x6+x7+x8+x10+x11+x12+x14+x15+x16)
summary(fitt3)
## we observe that adj R sq= 0.8854 and Multiple R-squared:
0.8972
## stage 2 checking vif
vif(fitt3)
## we observed that vif are not less than 5 so no dropping of
variable now.
## now checking for heteroscedasticity
plot(fitt3)
res <- residuals(fitt3)
ff <- fitted(fitt3)
cor(res,ff)
plot(res,ff)
##errors are random since correlation b/w res and ff is
approximately zero
## cheecking normality
shapiro.test(res)
## p value(=0.2962) is not less than 0.05 hence normal
## Lack Of fit
install.packages("alr3")
library(alr3)
pureErrorAnova
## Variable Selection..... Akaike Information criteria
install.packages("MASS")
library(MASS)
step(fitt3,direction="both")
install.packages("AICcmodavg")
library(AICcmodavg)
AICc(fitt3)
## AICc=634.9806
## prediction
y_hat<- predict(fitt3)
y_hat
cor(y,y_hat)
plot(y,y_hat)
plot(y_hat)
plot(y)
plot(y,y_hat)
plot(y,y_hat, type="l", col="green", lwd=5, xlab="y",
ylab="y_hat", main="graph between y and y_hat")
lines(y, y_hat, col="red", lwd=2)
lines(y, y_hat, type="b", col="red", lwd=2, pch=19)
#checking normality
options(scipen = 999)
ress<-residuals(fitt3)
ks.test(ress,"pnorm")
## Q-Q Plot
qqnorm(ress)
qqline(ress)
f<- lm(y~ x2+x4+x10+x15)
summary(f)
options(scipen = 999)
r<-residuals(f)
ks.test(r,"pnorm")
hist(r,127)
hist(r)