# R scripts for lesson 2
# correlation and regression

library(ggplot2); theme_set(theme_bw(18))
library(reshape2)
library(car)

########################
#     correlation      #
########################

# when we are interested in the relationship between two variables, we can measure the correlation between them
# there are several types of correlation coefficient

### Pearson correlation coefficient
# when we have two quantitative, normally distributed variables, we measure their linear relationship with the Pearson correlation coefficient

# load the data about flowers
data(iris)
help(iris) # shows a description of the dataset
# always check data using str and head, and maybe summary/psych::describe
head(iris)
str(iris)

# the data are sorted by species, which would produce strong autocorrelation in regression, so shuffle the rows
iris<-iris[sample(nrow(iris)),]

# first visualize all the scatterplots
pairs(iris)
# some of the relationships look promising

# let's look at how the values are distributed
df.l<-melt(iris,id.vars="Species",measure.vars=1:4,value.name="value",variable.name="flower.var") # convert to long format
qplot(value,data=df.l,binwidth=0.3)+facet_grid(Species~flower.var)

# to compute correlations in R, we can simply run cor
df<-subset(iris,select=-Species)
c<-cor(df)
c
# when we want to publish a table, we can keep only the lower triangle of the correlation matrix
c[upper.tri(c,diag=T)]<-NA
c

# those correlation coefficients are computed only for the sample; when we want to test whether they differ significantly from zero, we can use cor.test
# it can only compute the significance for one pair at a time
cor.test(~Sepal.Length+Petal.Length,df)
# the output of cor.test has the standard R layout:
# first the test statistic with the corresponding degrees of freedom and the p-value,
# then the alternative hypothesis, the 95% CI, and the sample estimate (the correlation coefficient in this case)

# when we want to run cor.test for a whole matrix, we need the Hmisc library and its rcorr function
library(Hmisc)
c<-rcorr(as.matrix(df))
c # the output has three parts: the correlation coefficients, the sample sizes for the individual comparisons, and the significance values
c$P # the p-values can also be extracted directly

### Spearman and Kendall correlation coefficients
# when we have a small sample (<20 observations) or the data are ordinal rather than quantitative, we can use the Spearman or Kendall correlation instead
# it is fairly easy
cor(iris$Sepal.Length,iris$Petal.Length,method="spearman")
# or, if we want to test the significance
cor.test(iris$Sepal.Length,iris$Petal.Length,method="kendall")
# if we use spearman instead, we will get a warning about ties
# cor.test(iris$Sepal.Length,iris$Petal.Length,method="spearman")

### Point-biserial correlation
# in R, this is easy: it is a Pearson correlation with a dichotomous variable
# in the iris data, we need to drop one species to get a dichotomy
df<-iris[iris$Species!="setosa",]
df$Species<-as.numeric(df$Species) # recode the two remaining species as numbers
cor.test(~Sepal.Length+Species,data=df)

########################
#  linear regression   #
########################

### linear regression with a single predictor
# a linear regression is fitted with lm
# the formula has the form DV~IV1

# select one flower type
iris.s<-iris[iris$Species=="setosa",]
# first look at the data graphically
plot(Petal.Length~Petal.Width,iris.s)
lm1<-lm(Petal.Length~Petal.Width,iris.s)
# now we can inspect the fitted model
summary(lm1)
# the square root of R^2 is the correlation
sqrt(summary(lm1)$r.squared)
cor(iris.s$Petal.Length,iris.s$Petal.Width)
# visualize the model diagnostics
plot(lm1)
# the Durbin-Watson test for autocorrelation should come out non-significant
durbinWatsonTest(lm1)
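# before the full influence table below, a minimal sketch of flagging
# influential observations via Cook's distance; the 4/n cutoff used here
# is only a common rule of thumb, not part of the lesson
cd<-cooks.distance(lm1)
plot(cd,type="h",ylab="Cook's distance")
abline(h=4/length(cd),lty=2) # assumed cutoff: 4/n
which(cd>4/length(cd)) # indices of potentially influential observations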
# look for influential observations
influence.measures(lm1)

# add a new predictor to the model
lm2<-lm(Petal.Length~Petal.Width+Sepal.Width,iris.s)
summary(lm2)
anova(lm1,lm2) # the added predictor does not improve the model significantly

# we can reverse the order in which the predictors enter
lm1.r<-lm(Petal.Length~Sepal.Width,iris.s)
summary(lm1.r)
# and now we get completely different results! so be careful when choosing the order of predictors
anova(lm1.r,lm2)

##############################
#  graphs for presentation   #
##############################

x<-1:8
y<-c(3,8,5,8,12,11,18,15)
lm1<-lm(y~x)
plot(lm1)
y.p<-predict(lm1)
#y.sst<-y-mean(y)
y.ssm<-y.p-mean(y)
df.gof<-data.frame(x=x,
                   y.ssr.l=pmin(y,y.p),      y.ssr.h=pmax(y,y.p),
                   y.sst.l=pmin(mean(y),y),  y.sst.h=pmax(mean(y),y),
                   y.ssm.l=pmin(mean(y),y.p),y.ssm.h=pmax(mean(y),y.p))

p<-qplot(x,y,size=I(4))

# residual sum of squares: dashed segments from the observations to the fitted line
p.ssr<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssr.l,xend=x,yend=y.ssr.h),data=df.gof,linetype="dashed")+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.ssr

# total sum of squares: dashed segments from the observations to the grand mean
p.sst<-p+
  geom_segment(aes(x=x,y=y.sst.l,xend=x,yend=y.sst.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.sst

# model sum of squares: dashed segments from the fitted line to the grand mean
p.ssm<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssm.l,xend=x,yend=y.ssm.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))

ggsave("images/cv2/lm_sst.png",p.sst)
ggsave("images/cv2/lm_ssr.png",p.ssr)
ggsave("images/cv2/lm_ssm.png",p.ssm)
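# a small numeric check, added here, of the decomposition the three plots
# illustrate: for OLS with an intercept, SST = SSM + SSR holds exactly
# (sst, ssm and ssr are helper variables introduced for this check)
sst<-sum((y-mean(y))^2)   # total sum of squares
ssm<-sum((y.p-mean(y))^2) # model (explained) sum of squares
ssr<-sum((y-y.p)^2)       # residual sum of squares
all.equal(sst,ssm+ssr)    # should be TRUE
ssm/sst                   # equals summary(lm1)$r.squared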