#### MEETING 4
####
#### NORMAL DISTRIBUTION
#### LAW of LARGE NUMBERS & CENTRAL LIMIT THEOREM
#### CONFIDENCE SETS
####

source("~/Desktop/ling280.R")

#### DATA
#### http://people.ucsc.edu/~mwagers/ling280/data/chamorro.Rdata

load("chamorro.Rdata)

#### Two data frames - judgment times in classifying good/bad extractions
#### poss$RT / RTs to possessor extractions
#### arg$RT  / RTs to verb argument extractions


hist(arg$RT, breaks=100)


#######################################
##  DISTRIBUTION OF THE SAMPLE MEAN  ##
#######################################

### SET UP A SIMULATION THAT DRAWS FROM arg$RT
### 100 experiments, 20 observations

n.sim <- 100
mean.sample<-vector("numeric", length=n.sim)

for(i in 1:n.sim){
	sample(arg$RT, size=40, replace=TRUE)->rt.sample
	mean(rt.sample) -> mean.sample[i]
}

##

plot(1:100, mean.sample, ylim=c(0, 4000))
abline(h = mean(arg$RT))

### Re-compute 'mean.sample' with size = 40 and size = 160
### Overplot the new mean.sample on the existing plot

points(1:100, mean.sample, col="red")


simulateExperiment<-function(my.population, n.obs = 40, n.sim = 100000){
# Simulate any number of experiments, of any sample size
# With any vector data
#
# ARGS: n.obs - number of observations per experiment
#       n.sim - number of experiments
#
# RETURN: vector of each experiment's mean 

  mean.sample <- vector("numeric", length=n.sim)
 
  for(i in 1:n.sim){
    sample(my.population, size=n.obs, replace=TRUE)->rt.sample
	  mean(rt.sample) -> mean.sample[i]
  }
  
  return(mean.sample)
  
}

### SAMPLING DISTRIBUTION OF THE SAMPLE MEAN
### RELATIONSHIPS WITH SAMPLE SIZE 

### Illustrates the Central Limit Theorem with a given 'population'
clt.population <- arg$RT
clt.population <- rexp(10000, rate=2)

# A set of sample sizes
n.obs <- 2^seq(3,11)
sim.size <- 10000

# Create a plotting grid with reasonable dimensions
ncol <- ceiling(sqrt(length(n.obs)))
nrow <- ceiling(length(n.obs) / ncol)
multiplot(nrow, ncol)

# Figure out good xlim
min.se <- sd(clt.population)/sqrt(min(n.obs))
pop.mean <- mean(clt.population)
max.xlim <- c(min(pop.mean - 5 * min.se, min(clt.population)), 
  pop.mean + 5 * min.se)

# Simulate and plot outcomes
for(i in n.obs) {
	simulateExperiment(clt.population, n.obs=i, n.sim=sim.size) -> sim.experiment
	hist(sim.experiment, 
    main=paste("n =",i), xlab=paste("skew = ", round(skew(sim.experiment),2)),
    breaks=50, xlim=max.xlim)
  print(shapiro.test(sample(sim.experiment, 5000, replace=TRUE)))
}


###
###
###

sampling.sd <- vector(length=10)

k<-1;

seq(10,500,50)->n.obs

for(i in n.obs){
	simulateExperiment(arg$RT, n.obs=i,n.sim=10000)->sim.experiment
	sd(sim.experiment)->sampling.sd[k]
	k<-k+1;
	}

plot(n.obs, sampling.sd,xlab="n.obs",ylab="sd(sample)", pch=20,cex=2)
lines(n.obs, sd(rt.plurals)/sqrt(n.obs),lwd=2,col="red")


### Normal distribution
###
###


my.norm.pdf <- function(  ) {
  # 
  #
  # ARGS:
  #
  # RETURNS:

  A <- sigma * sqrt(2*pi)
  B <- (-1/2) * (( x - mu ) / sigma)^2

  Px <- A^-1 * exp(B)

  return(...)

}

x <- seq(-5, 5, by=0.01)

plot(x, my.norm.pdf(x), type='l', lwd=2, col="red")
hist(rnorm(1e6), freq=FALSE, breaks=1e2, add=1)

# CDF
#
#

# 1
mu<-100
sigma<-10
n.obs<-10000

rnorm(n.obs, mean = mu, sd = sigma) -> norm.1
ecdf(norm.1) -> norm.1.ecdf;

seq(-5, 5) -> x.sd
min.x <- mu - 5 * sigma 
max.x <- mu + 5 * sigma

plot(x.sd, 
  norm.1.ecdf(seq(min.x, max.x, by=sigma)),
  xlab="SD",ylab="P(X<x)", type='b')

# 2
mu<-0.2
sigma<-.001

rnorm(n.obs, mean = mu, sd = sigma) -> norm.2
ecdf(norm.2) -> norm.2.ecdf;

min.x <- mu - 5 * sigma 
max.x <- mu + 5 * sigma

lines(x.sd,
  norm.2.ecdf(seq(min.x, max.x, by=sigma)),
  col="red", type='b')

# 3 
mu<-5039
sigma<-232
rnorm(n.obs, mean = mu, sd = sigma) -> norm.3
ecdf(norm.3) -> norm.3.ecdf;

min.x <- mu - 5 * sigma 
max.x <- mu + 5 * sigma

lines(x.sd,
  norm.3.ecdf(seq(min.x, max.x, by=sigma)),
  col="blue", type='b')

###

rnorm( 10000, 0, 1 ) -> norm.standard
quantile(norm.standard,c(0.005,0.025,0.05,0.165,0.5,0.835,0.95,0.975,0.995))->norm.quants
print(round(norm.quants,digits=2))

rnorm( 200, 50, 25 ) -> norm.4

se.n4 <- sd(norm.4)/sqrt(length(norm.4))

ci.95 <- c( mean(norm.4) - 2 * se.n4 ,
			mean(norm.4) + 2 * se.n4 )


######################
### T DISTRIBUTION  ##
######################

qt(0.025, df = 1:50)

rnorm( 20, 50, 25 ) -> norm.5

qt(c(0.025,0.975), df = 20 - 1) -> t.crit


### CONFIDENCE INTERVAL
se.n5 <- sd(norm.5)/sqrt(length(norm.5))

ci.95 <- mean(norm.5) + t.crit * se.n5			

####
t.test(norm.5)