# R scripts for lesson 2
# correlation and regression

library(ggplot2); theme_set(theme_bw(18))
library(reshape2)
library(car)

########################
#     correlation      #
########################

# when we are interested in the relationship between two variables, we can measure the correlation between them
# there are several types of correlation coefficient

### Pearson correlation coefficient
# when we have two quantitative, normally distributed variables, we measure their linear relationship with the Pearson correlation coefficient

# load the data about flowers
data(iris)
help(iris) # shows a description of the dataset
# always check data using str and head, and maybe summary/psych::describe
head(iris)
str(iris)

# the data are sorted by species, which would produce strong autocorrelation in regression, so shuffle the rows
iris<-iris[sample(nrow(iris)),]

# first visualize all the scatterplots
pairs(iris)
# some of the relationships look promising

# let's look at how the values are distributed
df.l<-melt(iris,id.vars="Species",measure.vars=1:4,value.name="value",variable.name="flower.var") # convert to long format
qplot(value,data=df.l,binwidth=0.3)+facet_grid(Species~flower.var)

# to compute correlations in R, we can simply run cor
df<-subset(iris,select=-Species)
c<-cor(df)
c
# when we want to publish a table, we can keep only the lower triangle of the correlation matrix
c[upper.tri(c,diag=T)]<-NA
c

# those correlation coefficients are computed only for the sample; when we want to test whether they differ significantly from zero, we can use cor.test
# it can only compute the significance for one pair at a time
cor.test(~Sepal.Length+Petal.Length,df)
# the output of cor.test has the standard R layout:
# first the test statistic with the corresponding degrees of freedom and the p-value,
# then the alternative hypothesis, the 95% CI, and the sample estimate (the correlation coefficient in this case)

# when we want to run cor.test for a whole matrix, we need the Hmisc library and its rcorr function
library(Hmisc)
c<-rcorr(as.matrix(df))
c # the output has three parts: the correlation coefficients, the sample sizes for the individual comparisons, and the significance values
c$P # the p-values can also be extracted directly

### Spearman and Kendall correlation coefficients
# when we have a small sample (<20 observations) or the data are ordinal rather than quantitative, we can use the Spearman or Kendall correlation instead
# it is fairly easy
cor(iris$Sepal.Length,iris$Petal.Length,method="spearman")
# or, if we want to test the significance
cor.test(iris$Sepal.Length,iris$Petal.Length,method="kendall")
# if we use spearman instead, we will get a warning about ties
# cor.test(iris$Sepal.Length,iris$Petal.Length,method="spearman")

### Point-biserial correlation
# in R, this is easy: it is a Pearson correlation with a dichotomous variable
# in the iris data, we need to drop one species to get a dichotomy
df<-iris[iris$Species!="setosa",]
df$Species<-as.numeric(df$Species) # recode the two remaining species as numbers
cor.test(~Sepal.Length+Species,data=df)

########################
#  linear regression   #
########################

### linear regression with a single predictor
# a linear regression is fitted with lm
# the formula has the form DV~IV1

# select one flower type
iris.s<-iris[iris$Species=="setosa",]
# first look at the data graphically
plot(Petal.Length~Petal.Width,iris.s)
lm1<-lm(Petal.Length~Petal.Width,iris.s)
# now we can inspect the fitted model
summary(lm1)
# the square root of R^2 is the correlation
sqrt(summary(lm1)$r.squared)
cor(iris.s$Petal.Length,iris.s$Petal.Width)
# visualize the model diagnostics
plot(lm1)
# the Durbin-Watson test for autocorrelation should come out non-significant
durbinWatsonTest(lm1)
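# before the full influence table below, a minimal sketch of flagging
# influential observations via Cook's distance; the 4/n cutoff used here
# is only a common rule of thumb, not part of the lesson
cd<-cooks.distance(lm1)
plot(cd,type="h",ylab="Cook's distance")
abline(h=4/length(cd),lty=2) # assumed cutoff: 4/n
which(cd>4/length(cd)) # indices of potentially influential observations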
# look for influential observations
influence.measures(lm1)

# add a new predictor to the model
lm2<-lm(Petal.Length~Petal.Width+Sepal.Width,iris.s)
summary(lm2)
anova(lm1,lm2) # the added predictor does not improve the model significantly

# we can reverse the order in which the predictors enter
lm1.r<-lm(Petal.Length~Sepal.Width,iris.s)
summary(lm1.r)
# and now we get completely different results! so be careful when choosing the order of predictors
anova(lm1.r,lm2)

##############################
#  graphs for presentation   #
##############################

x<-1:8
y<-c(3,8,5,8,12,11,18,15)
lm1<-lm(y~x)
plot(lm1)
y.p<-predict(lm1)
#y.sst<-y-mean(y)
y.ssm<-y.p-mean(y)
df.gof<-data.frame(x=x,
                   y.ssr.l=pmin(y,y.p),      y.ssr.h=pmax(y,y.p),
                   y.sst.l=pmin(mean(y),y),  y.sst.h=pmax(mean(y),y),
                   y.ssm.l=pmin(mean(y),y.p),y.ssm.h=pmax(mean(y),y.p))

p<-qplot(x,y,size=I(4))

# residual sum of squares: dashed segments from the observations to the fitted line
p.ssr<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssr.l,xend=x,yend=y.ssr.h),data=df.gof,linetype="dashed")+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.ssr

# total sum of squares: dashed segments from the observations to the grand mean
p.sst<-p+
  geom_segment(aes(x=x,y=y.sst.l,xend=x,yend=y.sst.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.sst

# model sum of squares: dashed segments from the fitted line to the grand mean
p.ssm<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssm.l,xend=x,yend=y.ssm.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))

ggsave("images/cv2/lm_sst.png",p.sst)
ggsave("images/cv2/lm_ssr.png",p.ssr)
ggsave("images/cv2/lm_ssm.png",p.ssm)
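# a small numeric check, added here, of the decomposition the three plots
# illustrate: for OLS with an intercept, SST = SSM + SSR holds exactly
# (sst, ssm and ssr are helper variables introduced for this check)
sst<-sum((y-mean(y))^2)   # total sum of squares
ssm<-sum((y.p-mean(y))^2) # model (explained) sum of squares
ssr<-sum((y-y.p)^2)       # residual sum of squares
all.equal(sst,ssm+ssr)    # should be TRUE
ssm/sst                   # equals summary(lm1)$r.squared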