# R scripts for lesson 3 # regression library(ggplot2); theme_set(theme_bw(18)) library(reshape2) library(car) # load data about flowers data(iris) help(iris) # there is a description about the dataset # always check data using commands str, head and maybe summary/psych::describe head(iris) str(iris) # Petal = okvetni listky # data are sorted, it would produce strong autocorrelation in regression iris<-iris[sample(nrow(iris)),] # first visualize all scatterplots pairs(iris) # some of the relationships looks promising # lets look, how are values distributed df.l<-melt(iris,id.vars="Species",measure.vars=1:4,value.name="value",variable.name="flower.var") # convert to long format qplot(value, data=df.l,binwidth=0.3)+facet_grid(Species~flower.var) ######################## # linear regression # ######################## ### linear regression for single variable # linear regression is fitted using command lm # formula is in form DV~IV1 # select one flower type iris.s<-iris[iris$Species=="setosa",] # first look at it graphically plot(Petal.Length~Petal.Width,iris.s) lm1<-lm(Petal.Length~Petal.Width,iris.s) # now we can compute test this model summary(lm1) # square root of R^2 is correlation sqrt(summary(lm1)$r.squared) cor(iris.s$Petal.Length,iris.s$Petal.Width) abline(lm1) # visualize this model par(mfrow=c(2,2)) plot(lm1) # test for autocorrelation should be negative durbinWatsonTest(lm1) # look for influential measures influence.measures(lm1) # add new predictor into the model lm2<-lm(Petal.Length~Petal.Width+Sepal.Width,iris.s) summary(lm2) anova(lm1,lm2) # it is not important # we can reverse order of predictors lm1.r<-lm(Petal.Length~Sepal.Width,iris.s) summary(lm1.r) # and now we get completely different results! So be careful when selecting predictors anova(lm1.r,lm2) # we can use categorical predictors lm.cat1<-lm(Petal.Length~Species,iris) summary(lm.cat1) lm.super<-lm(Petal.Length~Species*Petal.Width,iris) summary(lm.super) ############################## # graphs for presentation # ############################## x<-1:8 y<-c(3,8,5,8,12,11,18,15) lm1<-lm(y~x) plot(lm1) y.p<-predict(lm1) #y.sst<-y-mean(y) y.ssm<-y.p-mean(y) df.gof<-data.frame(x=x,y.ssr.l=pmin(y,y.p),y.ssr.h=pmax(y,y.p), y.sst.l=pmin(mean(y),y),y.sst.h=pmax(mean(y),y), y.ssm.l=pmin(mean(y),y.p),y.ssm.h=pmax(mean(y),y.p) ) p<-qplot(x,y,size=I(4)) p.ssr<-p+ stat_smooth(method="lm",se=F,size=1.4,col="black")+ geom_segment(aes(x=x,y=y.ssr.l,xend=x,yend=y.ssr.h),data=df.gof,linetype="dashed")+ coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2)) p.ssr p.sst<-p+ geom_segment(aes(x=x,y=y.sst.l,xend=x,yend=y.sst.h),data=df.gof,linetype="dashed")+ geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+ coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2)) p.sst p.ssm<-p+ stat_smooth(method="lm",se=F,size=1.4,col="black")+ geom_segment(aes(x=x,y=y.ssm.l,xend=x,yend=y.ssm.h),data=df.gof,linetype="dashed")+ geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+ coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2)) ggsave("images/cv2/lm_sst.png",p.sst) ggsave("images/cv2/lm_ssr.png",p.ssr) ggsave("images/cv2/lm_ssm.png",p.ssm)