##################################
### MEETING 9                  ###
### February 5, 2013     	   ###
### From Significance testing  ###
### To Statistical models	   ###
##################################

# Source the course functions
source("http://people.ucsc.edu/~mwagers/ling280/notes/ling280.R")
# Import today's data set
load(url("http://people.ucsc.edu/~mwagers/ling280/data/meeting7.Rdata"))
# Libs
library(gdata)

###
### FORMULATING A STATISTICAL MODEL
### ANALYSIS OF VARIANCE
###


### ACCEPTABILITY RATING STUDY
### RATE CENTER-EMBEDDED SENTENCES in a MAGNITUDE-ESTIMATION TASK
###
###	CE.all: grammatically "correct" CE (3 NPs, 3 VPs)
###	CE.vp1: Only 1st VP (interior VP) missing
###	CE.vp2: Only 2nd VP (medial VP) missing
###	CE.vp3: Only 3rd VP (peripheral VP) missing


CreateSample <- function(ce.data, n.participants=15){

  # draw 15 participant names from the participant category with the levels() function
  # in this case, we don't set replace=TRUE; we should but it leads to some complications about labeling that we don't need to worry about right now
  my.participants <- sample(levels(ce$participant), size = n.participants) 
  # create a dataframe that only includes those participants
  ce.sample <- subset(ce, participant %in% my.participants)
  # R needs to forget there were other participant names ... drop.levels does this
  ce.sample <- drop.levels(ce.sample) 

  my.sample <- with(ce.sample, tapply(mescore, list(participant, condition), mean))

  return(my.sample)

}

AnovaDecompose<-function(my.data.table, return.table="grand"){
	nrow(my.data.table) -> my.rows
	ncol(my.data.table) -> my.cols
	
	grand.mean <- mean(my.data.table)
	col.means <- apply(my.data.table,2,mean)
	row.means <- apply(my.data.table,1, mean)
	
	
	col.means.matrix <- matrix(rep(col.means, times=my.rows), nrow = my.rows, byrow=TRUE)
	grand.means.matrix <- matrix(rep(grand.mean, my.cols*my.rows), nrow = my.rows)
	row.means.matrix <- matrix(rep(row.means, times=my.cols), nrow = my.rows, byrow=FALSE)

  if(return.table=="grand"){
    return(grand.means.matrix)
  } else if(return.table=="cols"){
    return(col.means.matrix)
  } else { return(row.means.matrix) }

}

n.participants <- 20
CreateSample(ce,20)->my.sample

# JUST ERROR: x[i,j] = mu + epsilon
AnovaDecompose(my.sample, return.table="grand")->my.grand.mean

my.sample - my.grand.mean -> model.1.error
model.1 <- my.grand.mean + model.1.error
round(model.1, 4) == round(my.sample, 4)

# GROUP MEANS: x[i,j] = mu + alpha_j + epsilon
# alpha_j = condition.means - mu
AnovaDecompose(my.sample, return.table="cols") -> my.col.means
my.col.means - my.grand.mean -> my.col.effect

model.2.error <- my.sample - my.grand.mean - my.col.effect
model.2 <- my.grand.mean + my.col.effect + model.2.error
round(model.2, 4) == round(my.sample, 4)

#ANOVA

total <- model.1.error

between <- my.col.effect
within <- model.2.error

#
sum(total)
sum(between)
sum(within)

#
sum(total^2) -> ss.total
sum(between^2) -> ss.between
sum(within^2) -> ss.within
# notice anything
> ...

df.total <- length(my.sample) - 1
df.between <- ncol(my.sample) - 1
df.within <- df.total - df.between

ms.total <- ss.total/df.total
ms.between <- ss.between/df.between
ms.within <- ss.within/df.within

my.sample.F <- ms.between / ms.within


### WHAT DISTRIBUTION DOES F FOLLOW?

# WHY? THE F DISTRIBUTION!

integrate(function(x) df(x, df.between, df.within), my.sample.F, Inf)

# TAKE HOME: Do the Simulation in Section 5.3.1

# HOW TO DO IT USING FORMULA NOTION and aov()
# SET UP a DATA FRAME

conditions <- matrix(rep(colnames(my.sample), n.participants), ncol=4, byrow=TRUE)
subjects <- matrix(rep(rownames(my.sample), 4), ncol=4, byrow=FALSE)

my.sample.df<-data.frame(rating = as.vector(my.sample), 
  condition = as.vector(conditions), 
  subject = as.vector(subjects))

# AOV call
my.sample.aov <- aov(rating ~ condition, data = my.sample.df)
summary(my.sample.aov)

#
my.row.effect<-AnovaDecompose(my.sample,return.table="rows") - AnovaDecompose(my.sample)

my.sample.2 <- my.sample - my.row.effect
my.sample.df.2 <- data.frame(rating = as.vector(my.sample.2), 
  condition = as.vector(conditions), 
  subject = as.vector(subjects))

my.sample.aov.2 <- aov(rating ~ condition, data = my.sample.df.2)
summary(my.sample.aov.2)