x1 <- rnorm(100, 10, 10)
par(mfrow=c(1, 3))
plot(density(x1)); polygon(density(x1), col="gray", border="black")
x2 <- rbinom(100, 1, .5)
sum(x2)
hist(x2, col="gray")
y <- x1 + x2*40 + rnorm(100, 0, 10)
plot(density(y)); polygon(density(y), col="red", border="black")
par(mfrow=c(1, 1))

reg2 <- lm(y~x2)
summary(reg2)

y.x2 <- data.frame(y, x2)
split.y.x2 <- split(y.x2, x2)
y.0 <- split.y.x2$"0"
y.1 <- split.y.x2$"1"

par(mfrow=c(1, 2))
plot(as.numeric(row.names(y.0)), y.0$y, pch=20, col="blue", xlim=c(0, 100), ylim=range(y))
abline(mean(y.0$y), 0, col="blue")
points(as.numeric(row.names(y.1)), y.1$y, pch=20, col="red")
abline(mean(y.1$y), 0, col="red")
plot(as.factor(x2), y, col="lightblue", ylim=range(y))
abline(mean(y.0$y), 0, col="blue")
abline(mean(y.1$y), 0, col="red")
par(mfrow=c(1, 1))

residuals.0 <- (y.0$y - mean(y.0$y))
residuals.1 <- (y.1$y - mean(y.1$y))

# ANOVA (Analysis of Variance)
# With anova, we can compare the 2-mean model with the 1-mean model.
# That is, we can answer the question -- is the 2-mean model better than the 1-mean model, i.e., does keeping track of the variable x2 significantly reduce the error?
anova(reg2)

squared.error.2MEAN <- sum(residuals.0^2) + sum(residuals.1^2) # a.k.a. squared error within (groups)
round(squared.error.2MEAN)
squared.error.1MEAN <- sum((y - mean(y))^2) # a.k.a. total squared error
round(squared.error.1MEAN)
squared.error.difference <- squared.error.1MEAN - squared.error.2MEAN
round(squared.error.difference) # a.k.a. squared error between (groups)
# This is the part of the total error that is accounted for / eliminated by grouping, i.e., by keeping track of the variable x2.

mean.squared.error.within <- squared.error.2MEAN/(length(y) - 2)
round(mean.squared.error.within)
mean.squared.error.between <- squared.error.difference/(2 - 1)
round(mean.squared.error.between)

# If the variable x2 does not have any effect, i.e., if there aren't actually 2 groups in the sample but only 1, the between-groups error should be the same as the within-group error:
# -- H_0: mean.squared.error.between = mean.squared.error.within
# In ratio form:
# -- H_0: mean.squared.error.between / mean.squared.error.within = 1
# This ratio is our test statistic, namely the F-value in the anova output.
round(mean.squared.error.between/mean.squared.error.within, 2)

# An F-distribution with 1 and 98 degrees of freedom
plot(density(rf(100000, 1, 98)), xlim=range(0, 7), main="An F-distribution with 1 and 98 degrees of freedom")
1 - pf(mean.squared.error.between/mean.squared.error.within, 1, 98)
anova(reg2)

# A good intro to & discussion of the notion of degrees of freedom is available here:
# http://courses.ncssm.edu/math/Stat_Inst/PDFS/DFWalker.pdf

# THIRD ATTEMPT -- many means, i.e., predicting y values based on x1 values
# "Many means" is just a pedagogical aid; making the continuous variable x1 categorical (with whatever number of bins) is usually not a good idea; more about this later.
plot(x1, y, pch=20, col="blue")
# What to look for in a plot:
# -- direction of the cloud of points
# -- form / shape: linear, curving once, curving twice, etc.
# -- amount of scatter
# -- outliers

# Standardized plot: center both variables at their respective means and rescale them by their respective standard deviations.
# This centers the plot at the origin and ensures that the two axes have the same unit of measurement.
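# As an aside, base R's scale() does the same centering and rescaling in one step; the manual
# (x - mean(x)) / sd(x) version used below just makes the arithmetic explicit. A minimal check
# (assuming the x1 and y simulated above are still in the workspace):
all.equal(as.numeric(scale(x1)), (x1-mean(x1))/sd(x1))
all.equal(as.numeric(scale(y)), (y-mean(y))/sd(y))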
plot((x1-mean(x1))/sd(x1), (y-mean(y))/sd(y), pch=20, col="blue", xlab="standardized x1", ylab="standardized y")
abline(v=0, col="lightblue"); abline(h=0, col="lightblue")

par(mfrow=c(1, 2))
plot(x1, y, pch=20, col="blue", xlim=range(c(x1, -x1)), ylim=range(c(y, -y)))
plot((x1-mean(x1))/sd(x1), (y-mean(y))/sd(y), pch=20, col="blue", xlab="standardized x1", ylab="standardized y", xlim=range(c(x1, -x1)), ylim=range(c(y, -y)))
abline(v=0, col="lightblue"); abline(h=0, col="lightblue")
par(mfrow=c(1, 1))

# Another example: standardizing the CHILE plot
webreg <- "http://www.sagepub.co.uk//wrightandlondon//"
chile <- read.table(paste(webreg, "chile.dat", sep=""), header=T)
attach(chile)
names(chile)
par(mfrow=c(1, 2))
plot(LENGTH, HEAT, pch=20, col="blue", xlim=range(c(LENGTH, -LENGTH)), ylim=range(c(HEAT, -HEAT)))
abline(v=0, col="lightblue"); abline(h=0, col="lightblue")
plot((LENGTH-mean(LENGTH))/sd(LENGTH), (HEAT-mean(HEAT))/sd(HEAT), pch=20, col="blue", xlab="standardized LENGTH", ylab="standardized HEAT", xlim=range(c(LENGTH, -LENGTH)), ylim=range(c(HEAT, -HEAT)))
abline(v=0, col="lightblue"); abline(h=0, col="lightblue")
par(mfrow=c(1, 1))
detach(chile)

# Back to our regression: y~x1
reg1 <- lm(y~x1)
summary(reg1)
plot(x1, y, pch=20, col="blue")
abline(reg1)
segments(x1, y, x1, reg1$fitted.values, col="lightblue")
abline(mean(y), 0, col="darkred")

# Comparing reg1 with the 1-mean model
anova(reg1)
# Note that the p-value for the x1 coefficient (i.e., the slope of the regression line) is the same in the anova and the regression output.
# This value indicates whether the slope value is statistically significant, i.e., whether the model that keeps track of the x1 values is better than the simplest 1-mean model.

squared.error.reg1 <- round(sum((summary(reg1)$residuals)^2)) # a.k.a. squared error within (i.e., within the model / not accounted for by the model)
squared.error.reg1
squared.error.1MEAN <- sum((y - mean(y))^2) # a.k.a. total squared error
round(squared.error.1MEAN)
squared.error.difference <- squared.error.1MEAN - squared.error.reg1
round(squared.error.difference)
# a.k.a. squared error between -- this is the part of the total error that is accounted for / eliminated by grouping, i.e., by keeping track of the variable x1.

mean.squared.error.within <- squared.error.reg1/(length(y) - 2)
round(mean.squared.error.within)
mean.squared.error.between <- squared.error.difference/(2 - 1)
round(mean.squared.error.between)

# If the variable x1 does not have any effect, the between error should be the same as the within error:
# -- H_0: mean.squared.error.between = mean.squared.error.within
# In ratio form:
# -- H_0: mean.squared.error.between / mean.squared.error.within = 1
# This ratio is our test statistic, namely the F-value in the anova output.
round(mean.squared.error.between/mean.squared.error.within, 2)

# An F-distribution with 1 and 98 degrees of freedom
plot(density(rf(100000, 1, 98)), xlim=range(0, 7), main="An F-distribution with 1 and 98 degrees of freedom")
1 - pf(mean.squared.error.between/mean.squared.error.within, 1, 98)
anova(reg1)

# The coefficients for the intercept and slope (i.e., for the x1 variable) minimize the squared error, just like the mean does.
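# A quick cross-check (a sketch, using the simulated x1 and y from above): for simple regression, the
# least-squares slope and intercept also have closed forms,
# -- slope = cov(x1, y) / var(x1)
# -- intercept = mean(y) - slope * mean(x1)
# and these should match the coefficients extracted from the fitted model just below.
cov(x1, y)/var(x1)
mean(y) - (cov(x1, y)/var(x1))*mean(x1)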
summary(reg1)$coefficients
coef(reg1)
(reg1.slope <- summary(reg1)$coefficients[2, 1])
(reg1.intercept <- summary(reg1)$coefficients[1, 1])

# Alternative slopes
(alternative.slopes <- seq(reg1.slope-2, reg1.slope+2, length.out=49))
alternative.slopes[25] == reg1.slope
(errors.for.alternatives <- numeric(length=49))
for (i in 1:49) {
  errors.for.alternatives[i] <- sum((y-(reg1.intercept+(alternative.slopes[i]*x1)))^2)
}
errors.for.alternatives
errors.for.alternatives[25] == sum((summary(reg1)$residuals)^2)
min(errors.for.alternatives) == sum((summary(reg1)$residuals)^2)
plot(alternative.slopes, errors.for.alternatives, type="l", ylab="sum of squared residuals", xlab="slopes")
points(reg1.slope, sum((summary(reg1)$residuals)^2), pch=20, col="blue")

# Alternative intercepts
(alternative.intercepts <- seq(reg1.intercept-30, reg1.intercept+30, length.out=49))
alternative.intercepts[25] == reg1.intercept
(errors.for.alternatives <- numeric(length=49))
for (i in 1:49) {
  errors.for.alternatives[i] <- sum((y-(alternative.intercepts[i]+(reg1.slope*x1)))^2)
}
errors.for.alternatives
errors.for.alternatives[25] == sum((summary(reg1)$residuals)^2)
min(errors.for.alternatives) == sum((summary(reg1)$residuals)^2)
plot(alternative.intercepts, errors.for.alternatives, type="l", ylab="sum of squared residuals", xlab="intercepts")
points(reg1.intercept, sum((summary(reg1)$residuals)^2), pch=20, col="blue")

# Alternative intercepts and slopes
(alternative.slopes <- seq(reg1.slope-2, reg1.slope+2, length.out=49))
(alternative.intercepts <- seq(reg1.intercept-30, reg1.intercept+30, length.out=49))
alt.sl <- rep(alternative.slopes, each=49)
alt.int <- rep(alternative.intercepts, 49)
errors.for.alternatives <- numeric(length=49*49)
for (i in 1:(49*49)) {
  errors.for.alternatives[i] <- sum((y-(alt.int[i]+(alt.sl[i]*x1)))^2)
}
min(errors.for.alternatives) == sum((summary(reg1)$residuals)^2)

# install.packages("scatterplot3d")
library("scatterplot3d")
par(mfrow=c(1, 1))
(s3d <- scatterplot3d(alt.sl, alt.int, errors.for.alternatives, highlight.3d=TRUE, col.axis="blue", col.grid="lightblue", pch=20, type="p", angle=5))
s3d$points3d(reg1.slope, reg1.intercept, sum((summary(reg1)$residuals)^2), col="green", type="h", pch=20)
(s3d <- scatterplot3d(alt.sl, alt.int, errors.for.alternatives, highlight.3d=TRUE, col.axis="blue", col.grid="lightblue", pch=20, type="p", angle=120))
s3d$points3d(reg1.slope, reg1.intercept, sum((summary(reg1)$residuals)^2), col="green", type="h", pch=20)
(s3d <- scatterplot3d(alt.sl, alt.int, errors.for.alternatives, highlight.3d=TRUE, col.axis="blue", col.grid="lightblue", pch=20, type="p", angle=175))
s3d$points3d(reg1.slope, reg1.intercept, sum((summary(reg1)$residuals)^2), col="green", type="h", pch=20)
(s3d <- scatterplot3d(alt.sl, alt.int, errors.for.alternatives, highlight.3d=TRUE, col.axis="blue", col.grid="lightblue", pch=" ", lwd=2, type="h", lty.hplot=1, angle=45))
s3d$points3d(reg1.slope, reg1.intercept, sum((summary(reg1)$residuals)^2), col="green", type="p", pch=20)
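# The same point can be made with a general-purpose optimizer instead of a grid: minimizing the sum of
# squared errors over (intercept, slope) should land (approximately) on the lm() coefficients.
# A minimal sketch -- the helper name sse and the starting values c(0, 0) are illustrative choices,
# not part of the original script.
sse <- function(par) sum((y-(par[1]+(par[2]*x1)))^2)
(opt <- optim(c(0, 0), sse)) # Nelder-Mead by default
opt$par # should be close to coef(reg1)
coef(reg1)
opt$value # should be close to the minimum sum of squared residuals
sum((summary(reg1)$residuals)^2)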