########################################
############ PROBLEM #1 ############
########################################

### Load the GAM library
library(gam)

### Load the data from the website
x<-read.csv("http://people.ucsc.edu/~aspearot/Econ_217/Wages1983.csv")

### Clean the data: drop observations with missing parental education
x<-subset(x,!is.na(feduc))
x<-subset(x,!is.na(meduc))

### Run the GAM model
gamfit<-gam(log(wage)~s(educ)+s(exper)+s(feduc)+s(meduc),data=x,family="gaussian")

### Plot the smooth terms
par(mfrow=c(2,2))   ### initialize the 2x2 plot layout

plot(gamfit,terms="s(educ)",se=TRUE,rug=FALSE,main="log(wage)~s(educ)")   ### plot the smooth term in education
abline(h=0)      ### Add a horizontal line at zero
abline(v=12.7)   ### Mark important points in the figure (found by trial and error)
abline(v=14.1)

plot(gamfit,terms="s(exper)",se=TRUE,rug=FALSE,main="log(wage)~s(exper)")   ### plot the smooth term in experience
abline(h=0)      ### Add a horizontal line at zero
abline(v=10.5)   ### Mark important points in the figure (found by trial and error)
abline(v=13)

plot(gamfit,terms="s(feduc)",se=TRUE,rug=FALSE,main="log(wage)~s(feduc)")   ### plot the smooth term in father's education
abline(h=0)      ### Add a horizontal line at zero
abline(v=4.0)    ### Mark important points in the figure (found by trial and error)
abline(v=7.4)

plot(gamfit,terms="s(meduc)",se=TRUE,rug=FALSE,main="log(wage)~s(meduc)")   ### plot the smooth term in mother's education
abline(h=0)      ### Add a horizontal line at zero
abline(v=6)      ### Mark important points in the figure (found by trial and error)
abline(v=7.8)
abline(v=12)

########################################
############ PROBLEM #2A ###########
########################################

### The problem uses the dataset provided; for this answer, I create
### another fake dataset like the random datasets in the problem
x<-seq(0,10,length.out=100)            ### Create a uniform support of x's
cutoff<-runif(1,2,8)                   ### Randomly choose a kink point
b<-runif(1,4,5)                        ### Randomly generate a coefficient on the kink
u<-rnorm(length(x),0,2)                ### Create a random vector of noise
y<-x+b*ifelse(x>cutoff,x-cutoff,0)+u   ### Generate the outcome variable

#### Find the optimal kink by leave-one-out cross-validation
d<-data.frame(y,x)
plot(y~x,data=d)

hs<-seq(1,9,0.1)   ### Create a sequence of candidate kink points

for(h in 1:length(hs)){   ### iterate through all candidate kinks defined by hs
  for(i in 1:nrow(d)){    ### iterate through all rows in the dataset
    d_drop<-d[i,]         ### the dropped observation
    d_keep<-d[-i,]        ### data frame with the remaining observations
    fit<-lm(y~x+ifelse(x>hs[h],x-hs[h],0),data=d_keep)   ### run the model at this particular kink guess
    dropfit<-predict(fit,d_drop)                         ### Predict the fit of the dropped observation
    sqrerr<-(d_drop$y-as.numeric(dropfit))^2             ### calculate the squared error of the dropped observation relative to the fit
    if(h==1 & i==1){                                     ### Save the results (initialize on the first pass)
      results<-data.frame(h=hs[h],i=i,sqrerr=sqrerr)
    } else {
      results<-rbind(results,data.frame(h=hs[h],i=i,sqrerr=sqrerr))
    }
  }
}

SSRs<-sort(tapply(results$sqrerr,results$h,FUN=sum,na.rm=TRUE))   ### sort the cross-validated SSRs, minimum first
cutoff_hat<-as.numeric(names(SSRs[1]))                            ### identify the estimated cutoff from the labels
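### Optional sanity check (not required by the assignment): refit the piecewise-linear
### model at the estimated kink and overlay the fit on the scatterplot drawn above.
### This is only a sketch; it assumes d and cutoff_hat from above are still in memory
### and that the scatterplot is the active plot. "bestfit" is just a name introduced here.
bestfit<-lm(y~x+ifelse(x>cutoff_hat,x-cutoff_hat,0),data=d)   ### refit on the full sample at the estimated kink
lines(d$x,predict(bestfit,newdata=d),col="red",lwd=2)         ### overlay the fitted line (x is already sorted)
abline(v=cutoff_hat,lty=2)                                    ### mark the estimated kink location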
########################################
############ PROBLEM #2B ###########
########################################

### Create a function to draw a bootstrap sample (with replacement) from a data frame
### (this could be done inline, but it's easier with a function)
randomSample<-function(df,n){
  return(df[sample(nrow(df),n,replace=TRUE),])
}

### Run 20 bootstrap iterations
for(r in 1:20){                 ### Iterate through replications
  sd<-randomSample(d,nrow(d))   ### generate a bootstrap sample of the same size as d
  for(h in 1:length(hs)){       ### Iterate through candidate kink points
    for(i in 1:nrow(sd)){       ### Iterate through rows of the bootstrap sample
      d_drop<-sd[i,]            ### the dropped observation
      d_keep<-sd[-i,]           ### data frame with the remaining observations
      fit<-lm(y~x+ifelse(x>hs[h],x-hs[h],0),data=d_keep)   ### run the model at this particular kink guess
      dropfit<-predict(fit,d_drop)                         ### Predict the fit of the dropped observation
      sqrerr<-(d_drop$y-as.numeric(dropfit))^2             ### calculate the squared error of the dropped observation relative to the fit
      if(h==1 & i==1){                                     ### Save the results (re-initialize at the start of each replication)
        results<-data.frame(h=hs[h],i=i,sqrerr=sqrerr)
      } else {
        results<-rbind(results,data.frame(h=hs[h],i=i,sqrerr=sqrerr))
      }
    }
  }
  SSRs<-sort(tapply(results$sqrerr,results$h,FUN=sum,na.rm=TRUE))   ### sort the cross-validated SSRs, minimum first
  h_hat<-as.numeric(names(SSRs[1]))                                 ### identify the estimated cutoff for this replication
  if(r==1){results_h<-data.frame(r,h_hat)}                          ### Save the replication results
  if(r>1){results_h<-rbind(results_h,data.frame(r,h_hat))}
  print(c(r,h_hat))
}

### For the 90% confidence interval, take the 5th and 95th percentiles of the bootstrapped kink estimates
confid_int<-quantile(results_h$h_hat,probs=c(0.05,0.95),na.rm=TRUE)
print(confid_int)
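### Optional: visualize the bootstrap distribution of the kink estimate. This is only
### a sketch, not part of the required answer; it assumes results_h, cutoff_hat, and
### confid_int from the code above are still in memory.
par(mfrow=c(1,1))                                                     ### reset the layout in case the 2x2 grid from Problem 1 is still active
hist(results_h$h_hat,main="Bootstrap kink estimates",xlab="h_hat")    ### distribution of the 20 bootstrap estimates
abline(v=cutoff_hat,col="red",lwd=2)                                  ### point estimate from the full sample (Problem 2A)
abline(v=confid_int,col="blue",lty=2)                                 ### 90% confidence interval bounds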