Question

In: Statistics and Probability

For expert using R I try to solve this question((USING DATA FAITHFUL)) but each time I...

For expert using R

I try to solve this question((USING DATA FAITHFUL)) but each time I solve it, I have error , I try it many times. So,everything you write will be helpful..

Modify the EM-algorithm functions to work for a general K component Gaussian mixtures. Please use this function to fit a K= 1;2;3;4 modelto the old faithful data available in R (You need to initialize the EM-algorithm First ).   

Which modelseems to t the data better? (Hint: use BIC to compare models.)

Here what I try to use

## EM algorithm for univariate normal mixture

# The E-step

E.step <- function(x, pi, Mu, S2){

K <- length(pi)

n <- length(x)

tau <- matrix(rep(NA, n * K), ncol = K)

for (i in 1:n){

for (k in 1:K){

tau[i,k] <- pi[k] * dnorm(x[i], Mu[k], sqrt(S2[k]))

}

tau[i,] <- tau[i,] / sum(tau[i,])

}

return(tau)

}

#The M-step

M.step <- function(x, tau){

n <- length(x)

K <- dim(tau)[2]

tau.sum <- apply(tau, 2, sum)

pi <- tau.sum / n

Mu <- t(tau) %*% x / tau.sum

S2[1] <- t(tau[,1]) %*% (x - Mu[1])^2 / tau.sum[1]

S2[2] <- t(tau[,2]) %*% (x - Mu[2])^2 / tau.sum[2]

return(list(pi = pi, Mu = Mu, S2 = S2))

}

## The log-likelihood function

logL <- function(x, pi, Mu, S2){

n <- length(x)

ll <- 0

for (i in 1:n){

ll <- ll + log(pi[1] * dnorm(x[i], Mu[1], sqrt(S2[1])) +

pi[2] * dnorm(x[i], Mu[2], sqrt(S2[2])))

}

return(ll)

}

## The algorithm

EM <- function(x, pi, Mu, S2, tol){

t <- 0

ll.old <- -Inf

ll <- logL(x, pi, Mu, S2)

repeat{

t <- t + 1

if ((ll - ll.old) / abs(ll) < tol) break

ll.old <- ll

tau <- E.step(x, pi, Mu, S2)

M <- M.step(x, tau)

pi <- M$pi

Mu <- M$Mu

S2 <- M$S2

ll <- logL(x, M$pi, M$Mu, M$S2)

cat("Iteration", t, "logL =", ll, " ")

}

return(list(pi = M$pi, Mu = M$Mu, S2 = M$S2, tau = tau, logL = ll))

}

## generate data

set.seed(1)

pi <- c(0.3, 0.7)

Mu <- c(5, 10)

S2 <- c(1, 1)

n <- 1000

n1 <- rbinom(1, n, pi[1])

n2 <- n - n1

x1 <- rnorm(n1, Mu[1], sqrt(S2[1]))

x2 <- rnorm(n2, Mu[2], sqrt(S2[2]))

x <- c(x1, x2)

hist(x, freq = FALSE, ylim = c(0, 0.2))

# pick initial values

pi.init <- c(0.5, 0.5)

Mu.init <- c(3, 10)

S2.init <- c(0.4, 2)

#Run EM

A <- EM(x, pi.init, Mu.init, S2.init, tol = 10^-6)

#plot

t <- seq(0, 15, by = 0.01)

y <- pi[1] * dnorm(t, Mu[1], sqrt(S2[1])) +

pi[2] * dnorm(t, Mu[2], sqrt(S2[2]))

y.est <- A$pi[1] * dnorm(t, A$Mu[1], sqrt(A$S2[1])) +

A$pi[2] * dnorm(t, A$Mu[2], sqrt(A$S2[2]))

points(t, y, type = "l")

points(t, y.est, type = "l", col = 2, lty = 2)

# assign observations to components - clustering

d <- function(x) which(x == max(x))

apply(A$tau, 1, d)

apply(A$tau, 1, which.max)

# assess misclassification

table(apply(A$tau, 1, which.max), c(rep(1, n1), rep(2, n2)))

Solutions

Expert Solution

there is nothing wrong in your code. you just missed a space in the last line of the code.

table(apply(A $tau, 1, which.max), c(rep(1, n1), rep(2, n2)))

make sure that you use R programming version 3.5.1

the table obtained at the last step is

1 2
1 296 3
2 5 696

execute with this code

## EM algorithm for univariate normal mixture

# The E-step

E.step <- function(x, pi, Mu, S2){

K <- length(pi)

n <- length(x)

tau <- matrix(rep(NA, n * K), ncol = K)

for (i in 1:n){

for (k in 1:K){

tau[i,k] <- pi[k] * dnorm(x[i], Mu[k], sqrt(S2[k]))

}

tau[i,] <- tau[i,] / sum(tau[i,])

}

return(tau)

}

#The M-step

M.step <- function(x, tau){

n <- length(x)

K <- dim(tau)[2]

tau.sum <- apply(tau, 2, sum)

pi <- tau.sum / n

Mu <- t(tau) %*% x / tau.sum

S2[1] <- t(tau[,1]) %*% (x - Mu[1])^2 / tau.sum[1]

S2[2] <- t(tau[,2]) %*% (x - Mu[2])^2 / tau.sum[2]

return(list(pi = pi, Mu = Mu, S2 = S2))

}

## The log-likelihood function

logL <- function(x, pi, Mu, S2){

n <- length(x)

ll <- 0

for (i in 1:n){

ll <- ll + log(pi[1] * dnorm(x[i], Mu[1], sqrt(S2[1])) +

pi[2] * dnorm(x[i], Mu[2], sqrt(S2[2])))

}

return(ll)

}

## The algorithm

EM <- function(x, pi, Mu, S2, tol){

t <- 0

ll.old <- -Inf

ll <- logL(x, pi, Mu, S2)

repeat{

t <- t + 1

if ((ll - ll.old) / abs(ll) < tol) break

ll.old <- ll

tau <- E.step(x, pi, Mu, S2)

M <- M.step(x, tau)

pi <- M$pi

Mu <- M$Mu

S2 <- M$S2

ll <- logL(x, M$pi, M$Mu, M$S2)

cat("Iteration", t, "logL =", ll, " ")

}

return(list(pi = M$pi, Mu = M$Mu, S2 = M$S2, tau = tau, logL = ll))

}

## generate data

set.seed(1)

pi <- c(0.3, 0.7)

Mu <- c(5, 10)

S2 <- c(1, 1)

n <- 1000

n1 <- rbinom(1, n, pi[1])

n2 <- n - n1

x1 <- rnorm(n1, Mu[1], sqrt(S2[1]))

x2 <- rnorm(n2, Mu[2], sqrt(S2[2]))

x <- c(x1, x2)

hist(x, freq = FALSE, ylim = c(0, 0.2))

# pick initial values

pi.init <- c(0.5, 0.5)

Mu.init <- c(3, 10)

S2.init <- c(0.4, 2)

#Run EM

A <- EM(x, pi.init, Mu.init, S2.init, tol = 10^-6)

#plot

t <- seq(0, 15, by = 0.01)

y <- pi[1] * dnorm(t, Mu[1], sqrt(S2[1])) +

pi[2] * dnorm(t, Mu[2], sqrt(S2[2]))

y.est <- A$pi[1] * dnorm(t, A$Mu[1], sqrt(A$S2[1])) +

A$pi[2] * dnorm(t, A$Mu[2], sqrt(A$S2[2]))

points(t, y, type = "l")

points(t, y.est, type = "l", col = 2, lty = 2)

# assign observations to components - clustering

d <- function(x) which(x == max(x))

apply(A$tau, 1, d)

apply(A$tau, 1, which.max)

# assess misclassification

table(apply(A $tau, 1, which.max), c(rep(1, n1), rep(2, n2)))


Related Solutions

Use y=faithful$eruptions; x=faithful$waiting to set the eruption durations and waiting time between eruptions of the R...
Use y=faithful$eruptions; x=faithful$waiting to set the eruption durations and waiting time between eruptions of the R data set faithful in objects y and x, respectively, and complete the following parts. 1. Make an scatterplot of the (x,y) data. Does it support the assumption that the data follows the simple linear regression model? (Include the plot with your answer.) 2. Fit the simple linear regression model and construct a 90% CI for: a) the slope, and b) the mean eruption duration...
I am using Excel to try and solve these problems but can find the answers or...
I am using Excel to try and solve these problems but can find the answers or figure out why my answers are wrong, please help. 1. Today, your roommate purchased an annual perpetuity of $1,000. The payments on his perpetuity will begin in one year. You purchased an annual perpetuity of $1,000. Your payments begin immediately. Assuming a 10% discount rate for these perpetuities, which of the following statements is true? A. Your perpetuity is worth $1,000 more than the...
Please solve all of the question using R and do clarify the answers. Using the (SATGPA)...
Please solve all of the question using R and do clarify the answers. Using the (SATGPA) data set in (Stat2Data) package. Test by using ?= .01. 1) Create the following three variables and then print out all the six variables. A) Create new variable "SAT", which is the sum of (MathSAT) and (VerbalSAT). B) Create second new variable ("SATLevel"), and assign the value of( "SATLevel") as 1 when SAT<=1100, 2 when 1100<SAT<=1200, 3 when 1200<SAT<=1300, and 4 when SAT>1300. C)Create...
A simple Statistic question by using R, If I have two set of mean proportion data,...
A simple Statistic question by using R, If I have two set of mean proportion data, what test should I use? such as, [1] 0.7652632 0.7555354 0.7602588 0.7594096 0.7497992 0.5532588 0.7595661 0.6911504 [9] 0.5964602 0.6369565 0.7355828 0.7346225 0.5913793 0.6499079 0.6327273 0.6091873 [17] 0.6306122 0.5960784 0.5492918 0.6785714 0.5014787 0.5484848 0.5645403 0.6731343 [25] 0.6208191 0.6087248 0.6045045 0.7743390 0.5275862 0.5731278 [1] 0.6564195 0.5928482 0.6806709 0.5546422 0.5438393 0.5906535 0.6764637 0.6487188 [9] 0.5901547 0.6626735 0.5955325 0.7462415 0.5971111 0.5731504 0.6334729 0.6124653 [17] 0.6224686 0.5549067 0.6348427 0.6265627...
Answer the following questions using R (include both the input and output for each question). I)...
Answer the following questions using R (include both the input and output for each question). I) A random variable P has a Poisson distribution with a mean of 10. Solve for the probability that random variable P is greater than 8. II) What is the probability that in 30 tosses of a fair coin, the head comes up 10 or 15 times? III) What is the probability that a normal random variable is less than 40, assuming that the variable...
Answer the following questions using R (include both the input and output for each question). I)...
Answer the following questions using R (include both the input and output for each question). I) A random variable P has a Poisson distribution with a mean of 10. Solve for the probability that random variable P is greater than 8. II) What is the probability that in 30 tosses of a fair coin, the head comes up 10 or 15 times? III) What is the probability that a normal random variable is less than 40, assuming that it has...
The Book of R (Question 20.2) Please answer using R code. Continue using the survey data...
The Book of R (Question 20.2) Please answer using R code. Continue using the survey data frame from the package MASS for the next few exercises. The survey data set has a variable named Exer , a factor with k = 3 levels describing the amount of physical exercise time each student gets: none, some, or frequent. Obtain a count of the number of students in each category and produce side-by-side boxplots of student height split by exercise. Assuming independence...
Using the BAII Plus calculator I am having a difficult time using this calculator to solve...
Using the BAII Plus calculator I am having a difficult time using this calculator to solve this MIRR 0 =-325,000 1=50,000 2= 75,000 3=-60,000 4=225,000 5=300,000 Required rate of return =15% We discount all negative CFs(at 15%) time 0 We compound all positive cash flows (at 15%) to 5 years This is now the TV or Terminal Value Please help me understand how to calculate these numbers?
I am using the phbirths data in the faraway package in R. I want to: 1)...
I am using the phbirths data in the faraway package in R. I want to: 1) create a plot of the birth weight vs the gestational age and I want to colour code the points based on the mothers smoking status to determine whether or not smoking affects the babies. 2) fit a simple model (one regression line) along with both the main effects (parallel lines) and interaction (non parallel lines) ANCOVA model to the data and find out which...
How do I even begin to solve this using R statistical software? A random sample of...
How do I even begin to solve this using R statistical software? A random sample of eight pairs of twins was randomly assigned to treatment A or treatment B. The data are given in the following table: Twins 1 2 3 4 5 6 7 8 Treatment A 48.3 44.6 49.7 40.5 54.3 55.6 45.8 35.4 Treatment B 43.5 43.8 53.7 43.9 54.4 54.7 45.2 34.4 What is the p-value of the Wilcoxon signed-rank test? Is there any significant evidence...
ADVERTISEMENT
ADVERTISEMENT
ADVERTISEMENT