################################## ### MEETING 9 ### ### February 5, 2013 ### ### From Significance testing ### ### To Statistical models ### ################################## # Source the course functions source("http://people.ucsc.edu/~mwagers/ling280/notes/ling280.R") # Import today's data set load(url("http://people.ucsc.edu/~mwagers/ling280/data/meeting7.Rdata")) # Libs library(gdata) ### ### FORMULATING A STATISTICAL MODEL ### ANALYSIS OF VARIANCE ### ### ACCEPTABILITY RATING STUDY ### RATE CENTER-EMBEDDED SENTENCES in a MAGNITUDE-ESTIMATION TASK ### ### CE.all: grammatically "correct" CE (3 NPs, 3 VPs) ### CE.vp1: Only 1st VP (interior VP) missing ### CE.vp2: Only 2nd VP (medial VP) missing ### CE.vp3: Only 3rd VP (peripheral VP) missing CreateSample <- function(ce.data, n.participants=15){ # draw 15 participant names from the participant category with the levels() function # in this case, we don't set replace=TRUE; we should but it leads to some complications about labeling that we don't need to worry about right now my.participants <- sample(levels(ce$participant), size = n.participants) # create a dataframe that only includes those participants ce.sample <- subset(ce, participant %in% my.participants) # R needs to forget there were other participant names ... drop.levels does this ce.sample <- drop.levels(ce.sample) my.sample <- with(ce.sample, tapply(mescore, list(participant, condition), mean)) return(my.sample) } AnovaDecompose<-function(my.data.table, return.table="grand"){ nrow(my.data.table) -> my.rows ncol(my.data.table) -> my.cols grand.mean <- mean(my.data.table) col.means <- apply(my.data.table,2,mean) row.means <- apply(my.data.table,1, mean) col.means.matrix <- matrix(rep(col.means, times=my.rows), nrow = my.rows, byrow=TRUE) grand.means.matrix <- matrix(rep(grand.mean, my.cols*my.rows), nrow = my.rows) row.means.matrix <- matrix(rep(row.means, times=my.cols), nrow = my.rows, byrow=FALSE) if(return.table=="grand"){ return(grand.means.matrix) } else if(return.table=="cols"){ return(col.means.matrix) } else { return(row.means.matrix) } } n.participants <- 20 CreateSample(ce,20)->my.sample # JUST ERROR: x[i,j] = mu + epsilon AnovaDecompose(my.sample, return.table="grand")->my.grand.mean my.sample - my.grand.mean -> model.1.error model.1 <- my.grand.mean + model.1.error round(model.1, 4) == round(my.sample, 4) # GROUP MEANS: x[i,j] = mu + alpha_j + epsilon # alpha_j = condition.means - mu AnovaDecompose(my.sample, return.table="cols") -> my.col.means my.col.means - my.grand.mean -> my.col.effect model.2.error <- my.sample - my.grand.mean - my.col.effect model.2 <- my.grand.mean + my.col.effect + model.2.error round(model.2, 4) == round(my.sample, 4) #ANOVA total <- model.1.error between <- my.col.effect within <- model.2.error # sum(total) sum(between) sum(within) # sum(total^2) -> ss.total sum(between^2) -> ss.between sum(within^2) -> ss.within # notice anything > ... df.total <- length(my.sample) - 1 df.between <- ncol(my.sample) - 1 df.within <- df.total - df.between ms.total <- ss.total/df.total ms.between <- ss.between/df.between ms.within <- ss.within/df.within my.sample.F <- ms.between / ms.within ### WHAT DISTRIBUTION DOES F FOLLOW? # WHY? THE F DISTRIBUTION! integrate(function(x) df(x, df.between, df.within), my.sample.F, Inf) # TAKE HOME: Do the Simulation in Section 5.3.1 # HOW TO DO IT USING FORMULA NOTION and aov() # SET UP a DATA FRAME conditions <- matrix(rep(colnames(my.sample), n.participants), ncol=4, byrow=TRUE) subjects <- matrix(rep(rownames(my.sample), 4), ncol=4, byrow=FALSE) my.sample.df<-data.frame(rating = as.vector(my.sample), condition = as.vector(conditions), subject = as.vector(subjects)) # AOV call my.sample.aov <- aov(rating ~ condition, data = my.sample.df) summary(my.sample.aov) # my.row.effect<-AnovaDecompose(my.sample,return.table="rows") - AnovaDecompose(my.sample) my.sample.2 <- my.sample - my.row.effect my.sample.df.2 <- data.frame(rating = as.vector(my.sample.2), condition = as.vector(conditions), subject = as.vector(subjects)) my.sample.aov.2 <- aov(rating ~ condition, data = my.sample.df.2) summary(my.sample.aov.2)