#### MEETING 4 #### #### NORMAL DISTRIBUTION #### LAW of LARGE NUMBERS & CENTRAL LIMIT THEOREM #### CONFIDENCE SETS #### source("~/Desktop/ling280.R") #### DATA #### http://people.ucsc.edu/~mwagers/ling280/data/chamorro.Rdata load("chamorro.Rdata) #### Two data frames - judgment times in classifying good/bad extractions #### poss$RT / RTs to possessor extractions #### arg$RT / RTs to verb argument extractions hist(arg$RT, breaks=100) ####################################### ## DISTRIBUTION OF THE SAMPLE MEAN ## ####################################### ### SET UP A SIMULATION THAT DRAWS FROM arg$RT ### 100 experiments, 20 observations n.sim <- 100 mean.sample<-vector("numeric", length=n.sim) for(i in 1:n.sim){ sample(arg$RT, size=40, replace=TRUE)->rt.sample mean(rt.sample) -> mean.sample[i] } ## plot(1:100, mean.sample, ylim=c(0, 4000)) abline(h = mean(arg$RT)) ### Re-compute 'mean.sample' with size = 40 and size = 160 ### Overplot the new mean.sample on the existing plot points(1:100, mean.sample, col="red") simulateExperiment<-function(my.population, n.obs = 40, n.sim = 100000){ # Simulate any number of experiments, of any sample size # With any vector data # # ARGS: n.obs - number of observations per experiment # n.sim - number of experiments # # RETURN: vector of each experiment's mean mean.sample <- vector("numeric", length=n.sim) for(i in 1:n.sim){ sample(my.population, size=n.obs, replace=TRUE)->rt.sample mean(rt.sample) -> mean.sample[i] } return(mean.sample) } ### SAMPLING DISTRIBUTION OF THE SAMPLE MEAN ### RELATIONSHIPS WITH SAMPLE SIZE ### Illustrates the Central Limit Theorem with a given 'population' clt.population <- arg$RT clt.population <- rexp(10000, rate=2) # A set of sample sizes n.obs <- 2^seq(3,11) sim.size <- 10000 # Create a plotting grid with reasonable dimensions ncol <- ceiling(sqrt(length(n.obs))) nrow <- ceiling(length(n.obs) / ncol) multiplot(nrow, ncol) # Figure out good xlim min.se <- sd(clt.population)/sqrt(min(n.obs)) pop.mean <- mean(clt.population) max.xlim <- c(min(pop.mean - 5 * min.se, min(clt.population)), pop.mean + 5 * min.se) # Simulate and plot outcomes for(i in n.obs) { simulateExperiment(clt.population, n.obs=i, n.sim=sim.size) -> sim.experiment hist(sim.experiment, main=paste("n =",i), xlab=paste("skew = ", round(skew(sim.experiment),2)), breaks=50, xlim=max.xlim) print(shapiro.test(sample(sim.experiment, 5000, replace=TRUE))) } ### ### ### sampling.sd <- vector(length=10) k<-1; seq(10,500,50)->n.obs for(i in n.obs){ simulateExperiment(arg$RT, n.obs=i,n.sim=10000)->sim.experiment sd(sim.experiment)->sampling.sd[k] k<-k+1; } plot(n.obs, sampling.sd,xlab="n.obs",ylab="sd(sample)", pch=20,cex=2) lines(n.obs, sd(rt.plurals)/sqrt(n.obs),lwd=2,col="red") ### Normal distribution ### ### my.norm.pdf <- function( ) { # # # ARGS: # # RETURNS: A <- sigma * sqrt(2*pi) B <- (-1/2) * (( x - mu ) / sigma)^2 Px <- A^-1 * exp(B) return(...) } x <- seq(-5, 5, by=0.01) plot(x, my.norm.pdf(x), type='l', lwd=2, col="red") hist(rnorm(1e6), freq=FALSE, breaks=1e2, add=1) # CDF # # # 1 mu<-100 sigma<-10 n.obs<-10000 rnorm(n.obs, mean = mu, sd = sigma) -> norm.1 ecdf(norm.1) -> norm.1.ecdf; seq(-5, 5) -> x.sd min.x <- mu - 5 * sigma max.x <- mu + 5 * sigma plot(x.sd, norm.1.ecdf(seq(min.x, max.x, by=sigma)), xlab="SD",ylab="P(X norm.2 ecdf(norm.2) -> norm.2.ecdf; min.x <- mu - 5 * sigma max.x <- mu + 5 * sigma lines(x.sd, norm.2.ecdf(seq(min.x, max.x, by=sigma)), col="red", type='b') # 3 mu<-5039 sigma<-232 rnorm(n.obs, mean = mu, sd = sigma) -> norm.3 ecdf(norm.3) -> norm.3.ecdf; min.x <- mu - 5 * sigma max.x <- mu + 5 * sigma lines(x.sd, norm.3.ecdf(seq(min.x, max.x, by=sigma)), col="blue", type='b') ### rnorm( 10000, 0, 1 ) -> norm.standard quantile(norm.standard,c(0.005,0.025,0.05,0.165,0.5,0.835,0.95,0.975,0.995))->norm.quants print(round(norm.quants,digits=2)) rnorm( 200, 50, 25 ) -> norm.4 se.n4 <- sd(norm.4)/sqrt(length(norm.4)) ci.95 <- c( mean(norm.4) - 2 * se.n4 , mean(norm.4) + 2 * se.n4 ) ###################### ### T DISTRIBUTION ## ###################### qt(0.025, df = 1:50) rnorm( 20, 50, 25 ) -> norm.5 qt(c(0.025,0.975), df = 20 - 1) -> t.crit ### CONFIDENCE INTERVAL se.n5 <- sd(norm.5)/sqrt(length(norm.5)) ci.95 <- mean(norm.5) + t.crit * se.n5 #### t.test(norm.5)