################################## ### MEETING 7-8 ### ### January 29, 2013 ### ### From Significance testing ### ### To Statistical models ### ################################## #### R WARM UP #### Learning about tapply() and lattice graphing # Source the course functions source("http://people.ucsc.edu/~mwagers/ling280/notes/ling280.R") # Load the 'lattice' library # install.packages("lattice") if you don't have it ... library(lattice) #install.packages("gdata") library(gdata) # Import today's data set load(url("http://people.ucsc.edu/~mwagers/ling280/data/meeting7.Rdata")) ls() ### PLAUSIBILITY NORMING STUDY ### RATE PLAUSIBILITY OF SENTENCES IN THE FOLLOWING CONDITIONS ### ### 1 The FBI illegally detained the immigrant. ### 2 The FBI illegally detained the boat. ### 3 The FBI illegally interrogated the immigrant. ### 4 The FBI illegally interrogated the boat. # What is the experimental design? summary(ratings) # ... display outcome BY condition number (arbitrary) with(ratings, tapply (Rating, Category, mean)) -> ratings.table # ... display outcome BY experimental design (structured) with(ratings, tapply ( Rating, list ( ..., ... ) ,mean)) -> ratings.summary print(round(ratings.summary, digits=2)) # with(ratings,boxplot(Rating~DP*V)) # A fancy plot! # First, refresh graphics dev.off() quartz() # Second, plotting by experimental design # Note the call to `histogram` and not `hist` - # `histogram` is the lattice function: histogram( ~ Rating | V * DP, data = ratings, breaks=seq(0,5,1), col="dodgerblue", type="density") #### /R WARM UP ##### # Central Limit Theorem one more time -- we'll prob. skip this in class # But it might help you write your 3rd Problem Set # # Show that the sample mean of (non-normally distributed ratings) follows subset(ratings, DP=="Inanimate" & V=="D.ANIM")->sample.df sample.df$Rating -> sample.ratings multiplot(2,2) hist(sample.ratings,breaks=seq(0,5,1),col="cornflowerblue",border=1,main="Observed ratings") qqnorm(sample.ratings, col="mediumseagreen",main="Observed ratings") simulateExperiment(sample.ratings,Nobs=144,Nsim=5000)->sim.ratings.means hist(sim.ratings.means,breaks=10,col="palevioletred2",border=1,main="Sampling distribution") # Hmm ... what do these guys do? qqnorm(sim.ratings.means, col="mistyrose4",main="Sampling distribution") qqline(sim.ratings.means, lwd = 2) sd(sample.means) # ##### ### STANDARD ERROR FUNCTION ### With some error-checking: see VB p. 58-59 se<-function(x) { y <- x[!is.na(x)] sqrt(var(as.vector(y))/length(y)) } ### ### 95% CONFIDENCE INTERVAL ### ci<-function(x, size=0.95) { # # ARGS: # # RETURNS: m<-mean(x,na.rm=TRUE) standard.error<-se(x) len<-length(x) upper<-m+qt(size+(1-size)/2,df=len-1)*standard.error lower<-m+qt((1-size)/2,df=len-1)*standard.error return(data.frame(lower=lower,upper=upper)) } ### PULL OUT the VP=+ANIM CONDITIONS subset(ratings, V=="D.ANIM" & DP=="ANIM")$Rating->V.anim.DP.anim subset(ratings, V=="D.ANIM" & DP=="ANIM")$Rating->V.anim.DP.inanim ### ### RELATIONSHIP BETWEEN CONFIDENCE INTERVAL AND POPULATION MEAN ### # initialize some empty vectors n.sim <- 100 insideCI<-vector(length=n.sim); lower<-vector(length=n.sim); upper<-vector(length=n.sim); population.mean<-mean(V.anim.DP.anim); population.sd<-sd(V.anim.DP.anim); # for(i in 1:n.sim){ exp<-sample(V.anim.DP.anim, 15) lower[i]<-ci(exp, size=0.95)$lower upper[i]<-ci(exp, size=0.95)$upper if(lower[i] < population.mean & upper[i] > population.mean) { insideCI[i] <-TRUE } else{ insideCI[i] <-FALSE } } # prop.table(table(insideCI)) # or ... mean(insideCI) ### ONE SAMPLE T-TEST tscore <- (mean(V.anim.DP.anim))/(se(V.anim.DP.anim)) sample.size <- length(V.anim.DP.anim) pt(tscore, df = (sample.size-1)) > .... ### TWO SAMPLE T-TEST sample.size <- 15 sample(V.anim.DP.anim, sample.size) -> my.experiment.cond1 sample(V.anim.DP.inanim, sample.size) -> my.experiment.cond2 d <- mean(my.experiment.cond1) - mean(my.experiment.cond2) # Satterthwaite-corrected s.e. for low n, unequal variance s.pooled <- sqrt(sd(my.experiment.cond1)^2/sample.size + sd(my.experiment.cond2)^2/sample.size ) t.score <- d/s.pooled; pt(t.score, df = ...) p.val <- 2*(1-pt(t.score, df = ...)) t.test(my.experiment.cond1, my.experiment.cond2) ### HMMM.... # How many pair-wise differences are there among n conditions? # HANDSHAKE PROBLEM handshake <- function(n.hands) { sum(seq(0, n.hands - 1)) } # note: n.hands - 1 ... you don't shake your own hand ... seq(1, 12) -> many.hands many.hands -> many.handshakes for(i in 1:length(many.hands)) { handshake(many.hands[i]) -> many.handshakes[i] } plot(many.hands, many.handshakes) ### ### FORMULATING A STATISTICAL MODEL ### ANALYSIS OF VARIANCE ### ### ACCEPTABILITY RATING STUDY ### RATE CENTER-EMBEDDED SENTENCES in a MAGNITUDE-ESTIMATION TASK ### ### CE.all: grammatically "correct" CE (3 NPs, 3 VPs) ### CE.vp1: Only 1st VP (interior VP) missing ### CE.vp2: Only 2nd VP (medial VP) missing ### CE.vp3: Only 3rd VP (peripheral VP) missing # TWO SAMPLE T-TEST COMPARING CE.all to CE.vp2 t.test(ce$mescore[ce$condition=="CE.all"], ce$mescore[ce$condition=="CE.vp2"]) # treat 'participant' as a category, not a number factor(ce$participant) -> ce$participant # prepare an experiment with 10 participants n.participants <- 12 # draw 10 participant names from the participant category with the levels() function # in this case, we don't set replace=TRUE; we should but it leads to some complications about labeling that we don't need to worry about right now my.participants <- sample(levels(ce$participant), size = n.participants) # create a dataframe that only includes those participants ce.sample <- subset(ce, participant %in% my.participants) # R needs to forget there were other participant names ... drop.levels does this ce.sample <- drop.levels(ce.sample) my.sample <- with(ce.sample, tapply(mescore, list(participant, condition), mean)) # calculate row/col and grand mean grand.mean <- mean(my.sample) condition.means <- apply(my.sample, 2, mean) subject.means <- apply(my.sample, 1, mean) # create matrices to match number of observations grand.mean.matrix <- matrix(rep(grand.mean, n.participants*4), nrow = n.participants) condition.means.matrix <- matrix(rep(condition.means, times = n.participants), nrow = n.participants, byrow=TRUE) # JUST ERROR: x[i,j] = mu + epsilon model.1.error <- my.sample - grand.mean.matrix model.1 <- grand.mean.matrix + model.1.error round(model.1, 4) == round(my.sample, 4) # GROUP MEANS: x[i,j] = mu + alpha_j + epsilon # alpha_j = condition.means - mu condition.means.matrix <- (condition.means.matrix - grand.mean) model.2.error <- my.sample - grand.mean.matrix - condition.means.matrix model.2 <- grand.mean + condition.means.matrix + model.2.error round(model.2, 4) == round(my.sample, 4) #ANOVA total <- model.1.error between <- condition.means.matrix within <- model.2.error # sum(total) sum(between) sum(within) # sum(total^2) -> ss.total sum(between^2) -> ss.between sum(within^2) -> ss.within # notice anything > ... df.total <- length(my.sample) - 1 df.between <- length(condition.means) - 1 df.within <- df.total - df.between ms.total <- ss.total/df.total ms.between <- ss.between/df.between ms.within <- ss.within/df.within my.sample.F <- ms.between / ms.within ### WHAT DISTRIBUTION DOES F FOLLOW? # WHY? THE F DISTRIBUTION! integrate(function(x) df(x, df.between, df.within), my.sample.F, INF) # TAKE HOME: Do the Simulation in Section 5.3.1 # HOW TO DO IT USING FORMULA NOTION and aov() # SET UP a DATA FRAME conditions<-matrix(rep(colnames(my.sample), n.participants), ncol=4, byrow=TRUE) subjects<-matrix(rep(rownames(my.sample), 4), nrow=n.participants, byrow=FALSE) my.sample.df<-data.frame(rating = as.vector(my.sample), condition = as.vector(conditions), subject = as.vector(subjects)) # AOV call my.sample.aov <- aov(rating ~ condition, data = my.sample.df) summary(my.sample.aov)