# Estimating the parameters of mixture distributions

# Example 1: assume the data come from a mixture of 2 normal distributions and estimate
# the parameters of the 2 distributions as well as their weights

# -- simulate data
w <- c(0.22, 0.78)
mu <- c(-1, 3)
sigma <- c(0.3, 1)

# -- we generate 100 observations from the mixture distribution:
n.obs <- 100
y <- vector(length=n.obs)
str(y)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}

# -- take a look at the data
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 0.45))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2])), col="black", lty=2, lwd=2) # the theoretical density

# Now we estimate the parameters of the mixture (some of the following material is based on
# ch. 15 of "Bayesian Ideas and Data Analysis", R. Christensen et al., CRC Press 2011):
# -- we will need priors for the weights -- the Dirichlet family of distributions, which is a
#    generalization of the Beta family
# -- the Dirichlet is for the multinomial what the Beta is for the binomial
#install.packages("MCMCpack")
library("MCMCpack")

# -- we can have uniform priors for multinomials of any finite dimension
rdirichlet(10, c(1, 1))
apply(rdirichlet(10, c(1, 1)), 1, sum)
rdirichlet(10, c(1, 1, 1))
apply(rdirichlet(10, c(1, 1, 1)), 1, sum)
rdirichlet(10, c(1, 1, 1, 1))
apply(rdirichlet(10, c(1, 1, 1, 1)), 1, sum)

# -- but the Dirichlet family is very flexible, just like the Beta family
rdirichlet(10, c(1, 10))
rdirichlet(10, c(10, 1))
rdirichlet(10, c(5, 5))
rdirichlet(10, c(50, 50))
rdirichlet(10, c(0.5, 0.5))

# Plots
# -- a uniform univariate Dirichlet (with 2 components); the second column of draws is a
#    deterministic function of the first
alpha <- c(1, 1)
temp <- rdirichlet(10000, alpha)
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=F, col="lightblue", breaks=30, xlab="", main=paste("Histogram of component", i))
  lines(density(temp[, i]), col="blue", lwd=2)
}
par(mfrow=c(1, 1))

# -- a uniform bivariate Dirichlet (with 3 components); the third column of draws is a
#    deterministic function of the other two
alpha <- c(1, 1, 1)
temp <- rdirichlet(10000, alpha)
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=F, col="lightblue", breaks=30, xlab="", main=paste("Histogram of component", i))
  lines(density(temp[, i]), col="blue", lwd=2)
}
par(mfrow=c(1, 1))
library("MASS")
par(mfrow=c(3, 2))
kde.12 <- kde2d(temp[, 1], temp[, 2], n=50)
image(kde.12); contour(kde.12, add=T)
persp(kde.12, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.13 <- kde2d(temp[, 1], temp[, 3], n=50)
image(kde.13); contour(kde.13, add=T)
persp(kde.13, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.23 <- kde2d(temp[, 2], temp[, 3], n=50)
image(kde.23); contour(kde.23, add=T)
persp(kde.23, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
par(mfrow=c(1, 1))
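# A quick sanity check (a small sketch, not part of the original script): each marginal of a
# Dirichlet(alpha) is a Beta distribution, specifically Beta(alpha[i], sum(alpha)-alpha[i]),
# so we can overlay the corresponding Beta density on the histograms we just drew
# (alpha and temp are still the ones from the uniform 3-component Dirichlet above).
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=FALSE, col="lightblue", breaks=30, xlab="",
       main=paste("Component", i, "vs. its Beta marginal"))
  curve(dbeta(x, alpha[i], sum(alpha)-alpha[i]), add=TRUE, col="red", lwd=2)
}
par(mfrow=c(1, 1))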
# -- more Dirichlet distributions with 3 components
alpha <- c(10, 10, 10)
temp <- rdirichlet(10000, alpha)
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=F, col="lightblue", breaks=30, xlab="", main=paste("Histogram of component", i))
  lines(density(temp[, i]), col="blue", lwd=2)
}
par(mfrow=c(1, 1))
library("MASS")
par(mfrow=c(3, 2))
kde.12 <- kde2d(temp[, 1], temp[, 2], n=50)
image(kde.12); contour(kde.12, add=T)
persp(kde.12, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.13 <- kde2d(temp[, 1], temp[, 3], n=50)
image(kde.13); contour(kde.13, add=T)
persp(kde.13, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.23 <- kde2d(temp[, 2], temp[, 3], n=50)
image(kde.23); contour(kde.23, add=T)
persp(kde.23, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
par(mfrow=c(1, 1))

##
alpha <- c(0.5, 0.5, 0.5)
temp <- rdirichlet(10000, alpha)
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=F, col="lightblue", breaks=30, xlab="", main=paste("Histogram of component", i))
  lines(density(temp[, i]), col="blue", lwd=2)
}
par(mfrow=c(1, 1))
library("MASS")
par(mfrow=c(3, 2))
kde.12 <- kde2d(temp[, 1], temp[, 2], n=50)
image(kde.12); contour(kde.12, add=T)
persp(kde.12, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.13 <- kde2d(temp[, 1], temp[, 3], n=50)
image(kde.13); contour(kde.13, add=T)
persp(kde.13, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.23 <- kde2d(temp[, 2], temp[, 3], n=50)
image(kde.23); contour(kde.23, add=T)
persp(kde.23, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
par(mfrow=c(1, 1))

##
alpha <- c(10, 20, 30)
temp <- rdirichlet(10000, alpha)
par(mfrow=c(1, length(alpha)))
for (i in 1:length(alpha)) {
  hist(temp[, i], freq=F, col="lightblue", breaks=30, xlab="", main=paste("Histogram of component", i))
  lines(density(temp[, i]), col="blue", lwd=2)
}
par(mfrow=c(1, 1))
library("MASS")
par(mfrow=c(3, 2))
kde.12 <- kde2d(temp[, 1], temp[, 2], n=50)
image(kde.12); contour(kde.12, add=T)
persp(kde.12, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.13 <- kde2d(temp[, 1], temp[, 3], n=50)
image(kde.13); contour(kde.13, add=T)
persp(kde.13, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
kde.23 <- kde2d(temp[, 2], temp[, 3], n=50)
image(kde.23); contour(kde.23, add=T)
persp(kde.23, phi=50, theta=45, border=NULL, ticktype="detailed", xlab="", ylab="", zlab="")
par(mfrow=c(1, 1))

# Back to the estimation of the mixture
# -- the data again
w <- c(0.22, 0.78)
mu <- c(-1, 3)
sigma <- c(0.3, 1)
n.obs <- 100
y <- vector(length=n.obs)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 0.45))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2])), col="black", lty=2, lwd=2)

# The mixture model
cat("model{
# Likelihood
for (i in 1:n) {
  y[i] ~ dnorm(mu[z[i]], tau[z[i]]) # each observation y[i] comes from the normal component indexed by z[i]
}
for (i in 1:n) {
  z[i] ~ dcat(weight[1:2]) # the component indicators are drawn according to the 2 weights
}
# Priors
alpha[1] <- 1
alpha[2] <- 1
weight[1:2] ~ ddirch(alpha[1:2])
mu[1] ~ dnorm(0, 0.001)
delta.ini ~ dnorm(0, 0.001)
delta <- abs(delta.ini)
mu[2] <- mu[1] + delta
for (i in 1:2) {
  sigma[i] ~ dunif(0, 10)
  tau[i] <- 1/(sigma[i]*sigma[i])
}
}", fill=TRUE, file="mixture_1.txt")
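# The z[i] ~ dcat(...) / y[i] ~ dnorm(mu[z[i]], tau[z[i]]) hierarchy above says that, marginally
# over z, each observation has density sum_k weight[k] * dnorm(y, mu[k], sigma[k]).
# A small sketch (not part of the original script): the marginal log-likelihood of the data,
# evaluated here at the true parameter values used in the simulation.
mix.loglik <- function(y, w, mu, sigma) {
  dens <- sapply(seq_along(w), function(k) w[k] * dnorm(y, mu[k], sigma[k]))
  sum(log(rowSums(dens)))
}
mix.loglik(y, w, mu, sigma)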
# Assemble data
jags.data <- list(n=as.numeric(length(y)), y=as.numeric(y))

# Inits function
inits <- function() {list(mu=c(0, NA)+rnorm(2, 0, 0.1), weight=c(0.5, 0.5)+rnorm(2, 0, 0.01), delta.ini=rnorm(1, 0, 0.5), sigma=c(1, 1)+rnorm(2, 0, 0.1))}

# Parameters to estimate
params <- c("weight", "mu", "sigma")

# MCMC settings
nc <- 3       # Number of chains
ni <- 10000   # Number of draws from posterior for each chain
nb <- 5000    # Number of draws to discard as burn-in
nthin <- 10   # Thinning rate

# Load R2jags
library("R2jags")

# Call JAGS
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_1.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
print(outj, dig=3)
traceplot(outj)

# compare with true values
samples <- outj$BUGSoutput
print(samples$mean, dig=3)
mu
sigma
w

# -- we can also estimate the probability that each observation comes from a particular component,
#    although this is a bit less stable numerically
params <- c("weight", "mu", "sigma", "z")
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_1.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
samples <- outj$BUGSoutput
print(samples$mean$mu, dig=3)
mu
print(samples$mean$sigma, dig=3)
sigma
print(samples$mean$weight, dig=3)
w
plot(samples$mean$mu, 1:2, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2)")
for (i in 1:2) {
  segments(samples$mean$mu[i]-2*samples$mean$sigma[i], i, samples$mean$mu[i]+2*samples$mean$sigma[i], i, col="red", lwd=7.5)
}
points(y, samples$mean$z, pch="+", col="blue", cex=1.3)
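# A non-Bayesian cross-check (a small sketch, not part of the original script; it assumes the
# "mixtools" package is installed): fit the same 2-component normal mixture by EM and compare
# the estimated weights, means and sds with the posterior means above.
#install.packages("mixtools")
library("mixtools")
em.fit <- normalmixEM(y, k=2)
round(cbind(weight=em.fit$lambda, mu=em.fit$mu, sigma=em.fit$sigma), 3)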
# Example 2: assume the data come from a mixture of 3 normal distributions and estimate
# the parameters of the 3 distributions as well as their weights

# -- simulate data
w <- c(0.2, 0.3, 0.5)
mu <- c(-2, 1, 4)
sigma <- c(1, 0.5, 0.75)

# -- we generate 100 observations from the mixture distribution:
n.obs <- 100
y <- vector(length=n.obs)
str(y)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}

# -- take a look at the data
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 0.45))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2) # the theoretical density

# The mixture model
cat("model{
# Likelihood
for (i in 1:n) {
  y[i] ~ dnorm(mu[z[i]], tau[z[i]])
}
for (i in 1:n) {
  z[i] ~ dcat(weight[1:3])
}
# Priors
alpha[1] <- 1
alpha[2] <- 1
alpha[3] <- 1
weight[1:3] ~ ddirch(alpha[1:3])
mu[1] ~ dnorm(0, 0.001)
for (i in 1:2) {
  delta.ini[i] ~ dnorm(0, 0.001)
  delta[i] <- abs(delta.ini[i])
}
mu[2] <- mu[1] + delta[1]
mu[3] <- mu[2] + delta[2]
for (i in 1:3) {
  sigma[i] ~ dunif(0, 10)
  tau[i] <- 1/(sigma[i]*sigma[i])
}
}", fill=TRUE, file="mixture_2.txt")

# Assemble data
jags.data <- list(n=as.numeric(length(y)), y=as.numeric(y))

# Inits function
inits <- function() {list(mu=c(0, NA, NA)+rnorm(3, 0, 0.1), weight=c(1/3, 1/3, 1/3)+rnorm(3, 0, 0.01), delta.ini=rnorm(2, 0, 0.5), sigma=c(1, 1, 1)+rnorm(3, 0, 0.1))}

# Parameters to estimate
params <- c("weight", "mu", "sigma", "z")

# MCMC settings
nc <- 3       # Number of chains
ni <- 10000   # Number of draws from posterior for each chain
nb <- 5000    # Number of draws to discard as burn-in
nthin <- 10   # Thinning rate

# Load R2jags
library("R2jags")

# Call JAGS
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_2.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
print(outj, dig=3)
traceplot(outj)
samples <- outj$BUGSoutput

# compare with true values
print(samples$mean$mu, dig=3)
mu
print(samples$mean$sigma, dig=3)
sigma
print(samples$mean$weight, dig=3)
w
plot(samples$mean$mu, 1:3, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2 vs. 3)")
for (i in 1:3) {
  segments(samples$mean$mu[i]-2*samples$mean$sigma[i], i, samples$mean$mu[i]+2*samples$mean$sigma[i], i, col="red", lwd=7.5)
}
points(y, samples$mean$z, pch="+", col="blue", cex=1.3)

# What if we try to capture a mixture of 3 normals by means of only 2 normals?
# -- we'll probably be able to see the lack of fit when we plot the observations y against
#    their estimated mean classification
# -- the DIC will probably be bigger
cat("model{
# Likelihood
for (i in 1:n) {
  y[i] ~ dnorm(mu[z[i]], tau[z[i]])
}
for (i in 1:n) {
  z[i] ~ dcat(weight[1:2])
}
# Priors
alpha[1] <- 1
alpha[2] <- 1
#alpha[3] <- 1
weight[1:2] ~ ddirch(alpha[1:2])
mu[1] ~ dnorm(0, 0.001)
for (i in 1:1) {
  delta.ini[i] ~ dnorm(0, 0.001)
  delta[i] <- abs(delta.ini[i])
}
mu[2] <- mu[1] + delta[1]
#mu[3] <- mu[2] + delta[2]
for (i in 1:2) {
  sigma[i] ~ dunif(0, 10)
  tau[i] <- 1/(sigma[i]*sigma[i])
}
}", fill=TRUE, file="mixture_3.txt")

inits <- function() {list(mu=c(0, NA)+rnorm(2, 0, 0.1), weight=c(1/2, 1/2)+rnorm(2, 0, 0.01), delta.ini=rnorm(1, 0, 0.5), sigma=c(1, 1)+rnorm(2, 0, 0.1))}

# Load R2jags
library("R2jags")

# Call JAGS
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_3.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
print(outj, dig=3)
traceplot(outj)
samples <- outj$BUGSoutput

# compare with true values
print(samples$mean$mu, dig=3)
mu
print(samples$mean$sigma, dig=3)
sigma
print(samples$mean$weight, dig=3)
w
samples$mean$deviance
samples$DIC
plot(samples$mean$mu, 1:2, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2)")
for (i in 1:2) {
  segments(samples$mean$mu[i]-2*samples$mean$sigma[i], i, samples$mean$mu[i]+2*samples$mean$sigma[i], i, col="red", lwd=7.5)
}
points(y, samples$mean$z, pch="+", col="blue", cex=1.3)
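# Another way to see the lack of fit (a small sketch, not part of the original script):
# overlay the mixture density implied by the posterior means of the 2-component fit on the
# histogram of the data, next to the true 3-component density.
temp <- seq(min(y), max(y), length.out=1000)
post.dens <- samples$mean$weight[1]*dnorm(temp, samples$mean$mu[1], samples$mean$sigma[1]) +
             samples$mean$weight[2]*dnorm(temp, samples$mean$mu[2], samples$mean$sigma[2])
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 0.45))
lines(temp, post.dens, col="red", lwd=2) # fitted 2-component density
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2) # true 3-component density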
# What if we try to capture a mixture of 3 normals by means of 4 normals?
# -- when we plot the observations y against their estimated mean classification, we should be
#    able to see whether the extra component is actually used; the DIC tells us whether the
#    extra component improves the fit enough to be worth it
cat("model{
# Likelihood
for (i in 1:n) {
  y[i] ~ dnorm(mu[z[i]], tau[z[i]])
}
for (i in 1:n) {
  z[i] ~ dcat(weight[1:4])
}
# Priors
alpha[1] <- 1
alpha[2] <- 1
alpha[3] <- 1
alpha[4] <- 1
weight[1:4] ~ ddirch(alpha[1:4])
mu[1] ~ dnorm(0, 0.001)
for (i in 1:3) {
  delta.ini[i] ~ dnorm(0, 0.001)
  delta[i] <- abs(delta.ini[i])
}
mu[2] <- mu[1] + delta[1]
mu[3] <- mu[2] + delta[2]
mu[4] <- mu[3] + delta[3]
for (i in 1:4) {
  sigma[i] ~ dunif(0, 10)
  tau[i] <- 1/(sigma[i]*sigma[i])
}
}", fill=TRUE, file="mixture_4.txt")

inits <- function() {list(mu=c(0, NA, NA, NA)+rnorm(4, 0, 0.1), weight=c(1/4, 1/4, 1/4, 1/4)+rnorm(4, 0, 0.01), delta.ini=rnorm(3, 0, 0.5), sigma=c(1, 1, 1, 1)+rnorm(4, 0, 0.05))}

# MCMC settings
nc <- 3       # Number of chains
ni <- 15000   # Number of draws from posterior for each chain
nb <- 10000   # Number of draws to discard as burn-in
nthin <- 10   # Thinning rate

# Load R2jags
library("R2jags")

# Call JAGS
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_4.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
print(outj, dig=3)
traceplot(outj)
samples <- outj$BUGSoutput

# compare with true values
print(samples$mean$mu, dig=3)
mu
print(samples$mean$sigma, dig=3)
sigma
print(samples$mean$weight, dig=3)
w
samples$mean$deviance
samples$DIC
plot(samples$mean$mu, 1:4, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2 vs. 3 vs. 4)")
for (i in 1:4) {
  segments(samples$mean$mu[i]-2*samples$mean$sigma[i], i, samples$mean$mu[i]+2*samples$mean$sigma[i], i, col="red", lwd=7.5)
}
points(y, samples$mean$z, pch="+", col="blue", cex=1.3)
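# A quick way to see how much each of the 4 components is actually used (a small sketch,
# not part of the original script): tabulate the posterior modal classification of each
# observation; a component that is (almost) never the modal assignment is essentially empty.
z.mode <- apply(samples$sims.list$z, 2, function(zz) as.numeric(names(which.max(table(zz)))))
table(factor(z.mode, levels=1:4))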
15 of "Bayesian Ideas and Data Analysis" # -- we simulate data from a mixture of 3 normal distributions, but with very small sds to make the clusters very clear w <- c(0.2, 0.3, 0.5) mu <- c(-2, 1, 4) sigma <- c(0.1, 0.15, 0.2) # -- we generate 100 observations from the mixture distribution: n.obs <- 100 y <- vector(length=n.obs) for (i in 1:n.obs) { sample.draw <- t(rmultinom(1, size=1, prob=w)) y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma)) } hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1)) lines(density(y), col="blue", lwd=2) temp <- seq(min(y), max(y), length.out=1000) lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2) # Let's first try the model with a prespecified number of components, i.e., the 3-normal mixture model cat("model{ # Likelihood for (i in 1:n) { y[i] ~ dnorm(mu[z[i]], tau[z[i]]) } for (i in 1:n) { z[i] ~ dcat(weight[1:3]) } # Priors alpha[1] <- 1 alpha[2] <- 1 alpha[3] <- 1 weight[1:3] ~ ddirch(alpha[1:3]) mu[1] ~ dnorm(0, 0.001) for (i in 1:2) { delta.ini[i] ~ dnorm(0, 0.001) delta[i] <- abs(delta.ini[i]) } mu[2] <- mu[1] + delta[1] mu[3] <- mu[2] + delta[2] for (i in 1:3) { sigma[i] ~ dunif(0, 10) tau[i] <- 1/(sigma[i]*sigma[i]) } }", fill=TRUE, file="mixture_6.txt") # Assemble data jags.data <- list(n=as.numeric(length(y)), y=as.numeric(y)) # Inits function inits <- function() {list(mu=c(0, NA, NA)+rnorm(3, 0, 0.1), weight=c(1/3, 1/3, 1/3)+rnorm(3, 0, 0.01), delta.ini=rnorm(2, 0, 0.5), sigma=c(1, 1, 1)+rnorm(3, 0, 0.1))} # Parameters to estimate params <- c("weight", "mu", "sigma", "z") # MCMC settings nc <- 3 # Number of chains ni <- 10000 # Number of draws from posterior for each chain nb <- 5000 # Number of draws to discard as burn-in nthin <- 10 # Thinning rate # Load R2jags library("R2jags") # Call JAGS outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_6.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni) print(outj,dig=3) traceplot(outj) samples <- outj$BUGSoutput # compare with true values print(samples$mean$mu, dig=3) mu print(samples$mean$sigma, dig=3) sigma print(samples$mean$weight, dig=3) w plot(samples$mean$mu, 1:3, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2 vs. 
3)") for (i in 1:3) { segments(samples$mean$mu[i]-2*samples$mean$sigma[i], i, samples$mean$mu[i]+2*samples$mean$sigma[i], i, col="red", lwd=7.5) } points(y, samples$mean$z, pch="+", col="blue", cex=1.3) # -- now let's try the DP model cat("model{ # Likelihood: each observation y[i] comes from a particular cluster z[i] for (i in 1:n) { y[i] ~ dnorm(cluster.mean[z[i]], cluster.tau[z[i]]) z[i] ~ dcat(weight[1:n]) } # There are maximally as many clusters as there are observations, i.e., 100 in this case; the actual number of clusters will be much less #The number of clusters and weights for each cluster are determined by a stick-breaking prior; there are maximally as many clusters as there are points, no need to allow for more clusters than this, so we truncate the DP at n points # We first generate n-1 proportions to break the (remaining) stick for (j in 1:(n-1)) { q[j] ~ dbeta(1, alpha) } # Then we generate the weights by breaking the stick with these proportions weight[1] <- q[1] for (j in 2:(n-1)) { weight[j] <- q[j] * (1-q[j-1]) * weight[j-1] / q[j-1] } weight.sum <- sum(weight[1:(n-1)]) weight[n] <- 1-weight.sum # Priors # The baseline distribution for the centers of the clusters is normal centered at 0 and with a fairly large sd of 10 so that we can accommodate the entire range of the data; we could also put a vague, e.g., dunif(0, 30), hyperprior on the sd, but we don't do that now # The sds of all clusters come from the same fairly vague hyperprior dunif(0, 10) for (j in 1:n) { cluster.mean[j] ~ dnorm(mu0, tau0) cluster.tau[j] <- 1/(cluster.sigma[j]*cluster.sigma[j]) cluster.sigma[j] ~ dunif(0, 10) } mu0 <- 0 tau0 <- 1/(sigma0*sigma0) sigma0 <- 10 # Parameter over the precision of the DP alpha ~ dunif(0.3, 7) # We calculate summary statistics # indicate for each observation i if if comes from (possible) cluster j for (i in 1:n) { for (j in 1:n) { obs.by.cluster[i,j] <- equals(z[i], j) } } # calculate total number of active clusters j out of n possible clusters for (j in 1:n) { active[j] <- step(sum(obs.by.cluster[,j])-1) } n.cluster <- sum(active[]) }", fill=TRUE, file="mixture_7.txt") # Assemble data jags.data <- list(n=as.numeric(length(y)), y=as.numeric(y)) # Inits function inits <- function() {list(alpha=1+rnorm(1, 0, 0.1), cluster.sigma=rep(1, length(y))+rnorm(1, 0, 0.1))} # Parameters to estimate params <- c("cluster.mean", "cluster.sigma", "weight", "alpha", "z", "n.cluster") # MCMC settings nc <- 3 # Number of chains ni <- 10000 # Number of draws from posterior for each chain nb <- 5000 # Number of draws to discard as burn-in nthin <- 10 # Thinning rate # Load R2jags library("R2jags") # Call JAGS outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_7.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni) print(outj,dig=3) traceplot(outj) samples <- outj$BUGSoutput print(samples$mean,dig=3) # The final version of the model: We add weight sorting. 
# The final version of the model: we add weight sorting.
# -- we assign decreasing weights to clusters 1 through n; this makes the model identifiable
#    and the posterior distribution a lot easier to inspect
cat("model{
for (i in 1:n) {
  y[i] ~ dnorm(cluster.mean[z[i]], cluster.tau[z[i]])
  z[i] ~ dcat(weight[1:n])
}
# We order the weights in this case from the largest to the smallest for identifiability
for (j in 1:(n-1)) {
  q[j] ~ dbeta(1, alpha)
}
weight.ini[1] <- q[1]
for (j in 2:(n-1)) {
  weight.ini[j] <- q[j] * (1-q[j-1]) * weight.ini[j-1] / q[j-1]
}
weight.ini.sum <- sum(weight.ini[1:(n-1)])
weight.ini[n] <- 1-weight.ini.sum
sorted.weights <- sort(weight.ini[1:n])
for (j in 1:n) {
  weight[j] <- sorted.weights[n-j+1]
}
# Priors
for (j in 1:n) {
  cluster.mean[j] ~ dnorm(mu0, tau0)
  cluster.tau[j] <- 1/(cluster.sigma[j]*cluster.sigma[j])
  cluster.sigma[j] ~ dunif(0, 10)
}
mu0 <- 0
tau0 <- 1/(sigma0*sigma0)
sigma0 <- 10
alpha ~ dunif(0.3, 7)
# Summary statistics
for (i in 1:n) {
  for (j in 1:n) {
    obs.by.cluster[i,j] <- equals(z[i], j)
  }
}
for (j in 1:n) {
  active[j] <- step(sum(obs.by.cluster[,j])-1)
}
n.cluster <- sum(active[])
}", fill=TRUE, file="mixture_8.txt")

# Generate data
w <- rev(c(0.2, 0.3, 0.5))
mu <- rev(c(-2, 1, 4))
sigma <- rev(c(0.1, 0.15, 0.2))
n.obs <- 100
y <- vector(length=n.obs)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}

# Look at the data
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2)

# Assemble data
jags.data <- list(n=as.numeric(length(y)), y=as.numeric(y))

# Inits function
inits <- function() {list(alpha=1+rnorm(1, 0, 0.1), cluster.sigma=rep(1, length(y))+rnorm(1, 0, 0.1))}

# Parameters to estimate
params <- c("cluster.mean", "cluster.sigma", "weight", "alpha", "z", "n.cluster")

# MCMC settings
nc <- 3       # Number of chains
ni <- 25000   # Number of draws from posterior for each chain
nb <- 15000   # Number of draws to discard as burn-in
nthin <- 10   # Thinning rate

# Load R2jags
library("R2jags")

# Call JAGS
outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_8.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni)
print(outj, dig=3)
traceplot(outj)
samples <- outj$BUGSoutput
print(samples$mean, dig=3)
#load("mixture_8_May_6_2012.Rdata")

# compare with true values
round(samples$mean$n.cluster, 2)
round(samples$mean$cluster.mean, 2)
mu
round(samples$mean$cluster.sigma, 2)
sigma
round(samples$mean$weight, 2)
w
plot(samples$mean$cluster.mean[1:3], 1:3, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2 vs. 3)")
for (i in 1:3) {
  segments(samples$mean$cluster.mean[i]-2*samples$mean$cluster.sigma[i], i, samples$mean$cluster.mean[i]+2*samples$mean$cluster.sigma[i], i, col="red", lwd=7.5)
}
points(y, apply(samples$sims.list$z, 2, median), pch="+", col="blue", cex=1.3)

# We do not really need to go with the full DPM and have one cluster per point; this is the
# worst-case scenario, and often a (much) smaller number of clusters is enough.
# Moreover, estimation is (much) faster if we reduce the number of clusters.
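# How small a truncation can we get away with? A rough guide (a small sketch, not part of the
# original script; cf. the truncation discussion in Ohlssen et al. 2007): under the stick-breaking
# prior with q[j] ~ dbeta(1, alpha), the expected stick mass left unassigned after K-1 breaks is
# (alpha/(1+alpha))^(K-1), so we can pick a truncation level K that makes this negligible for
# plausible values of alpha.
leftover.mass <- function(K, alpha) (alpha/(1+alpha))^(K-1)
round(leftover.mass(K=7, alpha=c(0.5, 1, 2, 4)), 4)
round(leftover.mass(K=20, alpha=c(0.5, 1, 2, 4)), 4)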
cat("model{ for (i in 1:n.obs) { y[i] ~ dnorm(cluster.mean[z[i]], cluster.tau[z[i]]) z[i] ~ dcat(weight[]) } # We order the weights in this case from the largest to the smallest for identifiability for (j in 1:(n.clust-1)) { q[j] ~ dbeta(1, alpha) } weight.ini[1] <- q[1] for (j in 2:(n.clust-1)) { weight.ini[j] <- q[j] * (1-q[j-1]) * weight.ini[j-1] / q[j-1] } weight.ini.sum <- sum(weight.ini[1:(n.clust-1)]) weight.ini[n.clust] <- 1-weight.ini.sum sorted.weights <- sort(weight.ini[1:n.clust]) for (j in 1:n.clust) { weight[j] <- sorted.weights[(n.clust-j)+1] } # Priors for (j in 1:n.clust) { cluster.mean[j] ~ dnorm(mu0, tau0) cluster.tau[j] <- 1/(cluster.sigma[j]*cluster.sigma[j]) cluster.sigma[j] ~ dunif(0, 3) } mu0 <- 0 tau0 <- 1/(sigma0*sigma0) sigma0 <- 7 alpha ~ dunif(0.3, 4) # # Summary statistics # for (i in 1:n.obs) { # for (j in 1:n.clust) { # obs.by.cluster[i,j] <- equals(z[i], j) # } # } # for (j in 1:n.clust) { # active.clust[j] <- step(sum(obs.by.cluster[,j])-1) # } # n.active.clust <- sum(active.clust[]) }", fill=TRUE, file="mixture_9.txt") # Generate data w <- rev(c(0.1, 0.3, 0.6)) mu <- rev(c(-3, 1, 4.5)) sigma <- rev(c(0.1, 0.15, 0.2)) n.obs <- 100 y <- vector(length=n.obs) for (i in 1:n.obs) { sample.draw <- t(rmultinom(1, size=1, prob=w)) y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma)) } # Look at the data hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1.5)) lines(density(y), col="blue", lwd=2) temp <- seq(min(y), max(y), length.out=1000) lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2) # Assemble data n.clust <- 7 jags.data <- list(n.obs=as.numeric(length(y)), y=as.numeric(y), n.clust=n.clust) # Inits function inits <- function() {list(alpha=1+rnorm(1, 0, 0.1), cluster.sigma=rep(1, n.clust)+rnorm(1, 0, 0.1))} # Parameters to estimate params <- c("cluster.mean", "cluster.sigma", "weight", "alpha", "z") # MCMC settings nc <- 3 # Number of chains ni <- 450000 # Number of draws from posterior for each chain nb <- 250000 # Number of draws to discard as burn-in nthin <- 200 # Thinning rate # Load R2jags library("R2jags") # Call JAGS outj <- jags(jags.data, inits=inits, parameters.to.save=params, model.file="mixture_9.txt", n.thin=nthin, n.chains=nc, n.burnin=nb, n.iter=ni) print(outj,dig=3) traceplot(outj) samples <- outj$BUGSoutput print(samples$mean,dig=3) # compare with true values round(samples$mean$cluster.mean, 2) mu round(samples$mean$cluster.sigma, 2) sigma round(samples$mean$weight, 2) w plot(samples$mean$cluster.mean[1:3], 1:3, xlim=range(y), col="red", pch=19, cex=3, xlab="Observations y", ylab="Mean of the classification of y (groups 1 vs. 2 vs. 
3)") for (i in 1:3) { segments(samples$mean$cluster.mean[i]-2*samples$mean$cluster.sigma[i], i, samples$mean$cluster.mean[i]+2*samples$mean$cluster.sigma[i], i, col="red", lwd=7.5) } points(y, samples$mean$z, pch="+", col="blue", cex=1.3) # Doing it with the "DPpackage" #install.packages("DPpackage") library("DPpackage") # Generate data w <- rev(c(0.1, 0.3, 0.6)) mu <- rev(c(-3, 1, 4.5)) sigma <- rev(c(0.1, 0.15, 0.2)) n.obs <- 100 y <- vector(length=n.obs) for (i in 1:n.obs) { sample.draw <- t(rmultinom(1, size=1, prob=w)) y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma)) } # Look at the data hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1.5)) lines(density(y), col="blue", lwd=2) temp <- seq(min(y), max(y), length.out=1000) lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2) # Running DPpackage # Initial state state <- NULL # MCMC parameters nburn <- 1000 nsave <- 10000 nskip <- 10 ndisplay <- 100 mcmc <- list(nburn=nburn,nsave=nsave,nskip=nskip,ndisplay=ndisplay) # Example of Prior information 1: fixing alpha, m1, and Psi1 prior1 <- list(alpha=1, m1=rep(0,1), psiinv1=diag(0.5,1), nu1=4, tau1=1,tau2=100) fit1 <- DPdensity(y=y, prior=prior1, mcmc=mcmc, state=state, status=TRUE) # Posterior means fit1 # Plot the estimated density plot(fit1, ask=FALSE) # Extracting the density estimate cbind(fit1$x1, fit1$dens) # Plot the parameters (to see the plots gradually set ask=TRUE) plot(fit1, ask=TRUE, output="param") # Plot the a specific parameters plot(fit1, ask=FALSE, output="param", param="ncluster", nfigr=1, nfigc=2) # Extracting the posterior mean of the specific # means and covariance matrices DPrandom(fit1) # Ploting predictive information about the specific # means and covariance matrices # with HPD and Credibility intervals (to see the plots gradually set ask=TRUE) plot(DPrandom(fit1, predictive=TRUE), ask=FALSE) # Ploting information about all the specific means # and covariance matrices with HPD and Credibility intervals (to see the plots gradually set ask=TRUE) plot(DPrandom(fit1), ask=TRUE) # Example of Prior information 2: alpha is random, everything else is just as before prior2 <- list(a0=2, b0=1, m1=rep(0,1), psiinv1=diag(0.5,1), nu1=4, tau1=1,tau2=100) fit2 <- DPdensity(y=y, prior=prior2, mcmc=mcmc, state=state, status=TRUE) # Posterior means fit2 # Plot the estimated density plot(fit2, ask=FALSE) # Extracting the density estimate cbind(fit2$x1, fit2$dens) # Plot the parameters (to see the plots gradually set ask=TRUE) plot(fit2, ask=TRUE, output="param") # Plot the a specific parameters plot(fit2, ask=FALSE, output="param", param="ncluster", nfigr=1, nfigc=2) plot(fit2, ask=FALSE, output="param", param="alpha", nfigr=1, nfigc=2) # Extracting the posterior mean of the specific # means and covariance matrices DPrandom(fit2) # Ploting predictive information about the specific # means and covariance matrices # with HPD and Credibility intervals (to see the plots gradually set ask=TRUE) plot(DPrandom(fit2, predictive=TRUE), ask=FALSE) # Ploting information about all the specific means # and covariance matrices with HPD and Credibility intervals (to see the plots gradually set ask=TRUE) plot(DPrandom(fit2), ask=TRUE) # Let us see how DPM models fare with other kinds of data sets # 1. 
# Let us see how DPM models fare with other kinds of data sets

# 1. let's make the clusters less distinct
# Generate data
w <- c(0.35, 0.25, 0.4)
mu <- c(-1, 1, 2)
sigma <- c(0.7, 0.75, 0.7)
n.obs <- 100
y <- vector(length=n.obs)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}

# Look at the data
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1.5))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="black", lty=2, lwd=2)

# Running DPpackage
library("DPpackage")
# Initial state
state <- NULL
# MCMC parameters
nburn <- 1000
nsave <- 10000
nskip <- 10
ndisplay <- 100
mcmc <- list(nburn=nburn, nsave=nsave, nskip=nskip, ndisplay=ndisplay)
prior2 <- list(a0=2, b0=1, m1=rep(0,1), psiinv1=diag(0.5,1), nu1=4, tau1=1, tau2=100)
fit2 <- DPdensity(y=y, prior=prior2, mcmc=mcmc, state=state, status=TRUE)

# Posterior means
fit2
# Plot the estimated density and the density line for the true underlying mixture
plot(fit2, ask=FALSE)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3])), col="blue", lty=3, lwd=2)
# Plot the alpha and ncluster parameters
plot(fit2, ask=FALSE, output="param", param="ncluster", nfigr=1, nfigc=2)
plot(fit2, ask=FALSE, output="param", param="alpha", nfigr=1, nfigc=2)
# Plotting predictive information about the specific means and covariance matrices,
# with HPD and credibility intervals
plot(DPrandom(fit2, predictive=TRUE), ask=TRUE)

# 2. let's add more clusters
# Generate data
w <- c(0.2, 0.2, 0.3, 0.3)
mu <- c(-1, 1, 2, 5)
sigma <- c(0.7, 0.75, 0.7, 1.3)
n.obs <- 100
y <- vector(length=n.obs)
for (i in 1:n.obs) {
  sample.draw <- t(rmultinom(1, size=1, prob=w))
  y[i] <- rnorm(1, as.numeric(sample.draw %*% mu), as.numeric(sample.draw %*% sigma))
}

# Look at the data
hist(y, col="lightblue", freq=FALSE, breaks=30, ylim=range(0, 1.5))
lines(density(y), col="blue", lwd=2)
temp <- seq(min(y), max(y), length.out=1000)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3]))+(w[4]*dnorm(temp, mu[4], sigma[4])), col="black", lty=2, lwd=2)

# Running DPpackage
library("DPpackage")
# Initial state
state <- NULL
# MCMC parameters
nburn <- 1000
nsave <- 10000
nskip <- 10
ndisplay <- 100
mcmc <- list(nburn=nburn, nsave=nsave, nskip=nskip, ndisplay=ndisplay)
prior2 <- list(a0=2, b0=1, m1=rep(0,1), psiinv1=diag(0.5,1), nu1=4, tau1=1, tau2=100)
fit2 <- DPdensity(y=y, prior=prior2, mcmc=mcmc, state=state, status=TRUE)

# Posterior means
fit2
# Plot the estimated density and the density line for the true underlying mixture
plot(fit2, ask=FALSE)
lines(temp, (w[1]*dnorm(temp, mu[1], sigma[1]))+(w[2]*dnorm(temp, mu[2], sigma[2]))+(w[3]*dnorm(temp, mu[3], sigma[3]))+(w[4]*dnorm(temp, mu[4], sigma[4])), col="blue", lty=3, lwd=2)
# Plot the alpha and ncluster parameters
plot(fit2, ask=FALSE, output="param", param="ncluster", nfigr=1, nfigc=2)
plot(fit2, ask=FALSE, output="param", param="alpha", nfigr=1, nfigc=2)
# Plotting predictive information about the specific means and covariance matrices,
# with HPD and credibility intervals
plot(DPrandom(fit2, predictive=TRUE), ask=FALSE)