# R scripts for lesson 3
# regression

library(ggplot2); theme_set(theme_bw(18))
library(reshape2)

library(car)


# load data about flowers
data(iris)
help(iris) # there is a description about the dataset

# always check data using commands str, head and maybe summary/psych::describe
head(iris)
str(iris)
# Petal = okvetni listky

# data are sorted, it would produce strong autocorrelation in regression
iris<-iris[sample(nrow(iris)),]

# first visualize all scatterplots
pairs(iris)

# some of the relationships looks promising

# lets look, how are values distributed
df.l<-melt(iris,id.vars="Species",measure.vars=1:4,value.name="value",variable.name="flower.var") # convert to long format
qplot(value, data=df.l,binwidth=0.3)+facet_grid(Species~flower.var)


########################
#   linear regression  #
########################

### linear regression for single variable
# linear regression is fitted using command lm
# formula is in form DV~IV1

# select one flower type
iris.s<-iris[iris$Species=="setosa",]

# first look at it graphically 
plot(Petal.Length~Petal.Width,iris.s)

lm1<-lm(Petal.Length~Petal.Width,iris.s)
# now we can compute test this model
summary(lm1)

# square root of R^2 is correlation
sqrt(summary(lm1)$r.squared)
cor(iris.s$Petal.Length,iris.s$Petal.Width)

abline(lm1)
# visualize this model
par(mfrow=c(2,2))
plot(lm1)

# test for autocorrelation should be negative
durbinWatsonTest(lm1)

# look for influential measures
influence.measures(lm1)

# add new predictor into the model
lm2<-lm(Petal.Length~Petal.Width+Sepal.Width,iris.s)
summary(lm2)
anova(lm1,lm2) # it is not important

# we can reverse order of predictors
lm1.r<-lm(Petal.Length~Sepal.Width,iris.s)
summary(lm1.r)

# and now we get completely different results! So be careful when selecting predictors
anova(lm1.r,lm2)

# we can use categorical predictors
lm.cat1<-lm(Petal.Length~Species,iris)
summary(lm.cat1)

lm.super<-lm(Petal.Length~Species*Petal.Width,iris)
summary(lm.super)

##############################
#   graphs for presentation  #
##############################

x<-1:8
y<-c(3,8,5,8,12,11,18,15)
lm1<-lm(y~x)
plot(lm1)
y.p<-predict(lm1)
#y.sst<-y-mean(y)
y.ssm<-y.p-mean(y)

df.gof<-data.frame(x=x,y.ssr.l=pmin(y,y.p),y.ssr.h=pmax(y,y.p),
                   y.sst.l=pmin(mean(y),y),y.sst.h=pmax(mean(y),y),
                   y.ssm.l=pmin(mean(y),y.p),y.ssm.h=pmax(mean(y),y.p)
)

p<-qplot(x,y,size=I(4))
p.ssr<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssr.l,xend=x,yend=y.ssr.h),data=df.gof,linetype="dashed")+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.ssr
p.sst<-p+
  geom_segment(aes(x=x,y=y.sst.l,xend=x,yend=y.sst.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))
p.sst

p.ssm<-p+
  stat_smooth(method="lm",se=F,size=1.4,col="black")+
  geom_segment(aes(x=x,y=y.ssm.l,xend=x,yend=y.ssm.h),data=df.gof,linetype="dashed")+
  geom_segment(aes(x=x[1],xend=x[length(x)],y=mean(y),yend=mean(y)),size=1.4)+
  coord_fixed(ratio=0.5)+scale_x_continuous(breaks=x)+scale_y_continuous(breaks=seq(2,18,2))

ggsave("images/cv2/lm_sst.png",p.sst)
ggsave("images/cv2/lm_ssr.png",p.ssr)
ggsave("images/cv2/lm_ssm.png",p.ssm)