#Tip: To run any line in RStudio you can use Ctrl+Enter or Command+Enter.
#To use any package in R, we need to load its library first.To run DESeq2 
#you just need to type the following.
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
#We are going to use other libraries and lets load them too.
library("ggplot2")
library("RColorBrewer")
library("gplots")
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:IRanges':
## 
##     space
## 
## The following object is masked from 'package:stats':
## 
##     lowess
source("/project/umw_biocore/class/funcs.R")

#1. Reading the data.
# We already made gene quantifications and merged expected counts into 
# a single table. Here we prepared another table using whole reads 
# (not only reduced reads) to give you an idea about the complete 
# picture of DESeq analysis.

file <- "/project/umw_biocore/class/data.tsv"

#1. Read the data with row.names. We have the gene names in the 
# first column in this table. When we set row.names=1 It will remove 
# the first column from the data and put them to the row name section.

rsem <- read.table(file,sep="\t", header=TRUE, row.names=1)

#2. Creating the data structure for DESeq Analysis, we need to select 
# the columns that we are going to use in the analysis. Here we are 
# going to use experiment and control data that will be 6 coulmns. 
# Here in this step make sure to include the columns you want to use in 
# your analysis.
columns <- c("exper_rep1","exper_rep2","exper_rep3",
            "control_rep1","control_rep2","control_rep3")

data <- data.frame(rsem[, columns])


#3. Scatter plots are a key approach to assess differences between conditions. 
# We plot the average expression value (counts or TPMs for example) of 
# two conditions.First calculate the average reads per gene. To find the 
# average we sum experiments per gene and divide it to the number of replicas

# In this case we will divide the sum to 3. We will calculate the average for 
# control libraries too. After that, we are ready to merge  the average values 
# from experiment and control to create a two column  data structure using the 
# cbind function that merges tables.

avgall<-cbind(rowSums(data[c("exper_rep1","exper_rep2","exper_rep3")])/3, 
              rowSums(data[c("control_rep1","control_rep2","control_rep3")])/3)

# We can also change the column names using colnames function.

colnames(avgall)<-c("Treat", "Control")


#4.There are several packages in R that can be used to 
# generate scatter plots. The same plot can be generated using 
# the ggplot package.

gdat<-data.frame(avgall)
ggplot() +
  geom_point(data=gdat, aes_string(x="Treat", y="Control"),
             colour="black", alpha=6/10, size=3) +
  scale_x_log10() +scale_y_log10()

#5. In Eukaryotes only a subset of all genes are expressed in 
# a given cell. Expression is therefore a bimodal distribution, 
# with non-expressed genes having counts that result from experimental 
# and biological noise. It is important to Filter out the genes 
# that are not expressed before doing differential gene expression. 

# You can decide which cutoff separates expressed vs non-expressed 
# genes by looking your histogram we created.

# In our case a total sum of 10 counts separates well expressed 
# from non-expressed genes

sumd <- rowSums(data)
hist(log10(sumd), breaks=100)
abline(v=1)

all2all(data)
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties
## Warning in cor.test.default(x, y, method = "spearman", na.rm = T): Cannot
## compute exact p-value with ties

######################################
## LETS START DESeq ANALYSIS
######################################
#
#6. The goal of Differential gene expression analysis is to find 
# genes or transcripts whose difference in expression, when accounting 
# for the variance within condition, is higher than expected by chance. 

# The first step is to indicate the condition that each column (experiment) 
# in the table represent. 

# Here we define the correspondence between columns and conditions. 
# Make sure the order of the columns matches to your table.

columns
## [1] "exper_rep1"   "exper_rep2"   "exper_rep3"   "control_rep1"
## [5] "control_rep2" "control_rep3"
conds <- factor( c("Control","Control", "Control",
                   "Treat", "Treat","Treat") )


# DESeq will compute the probability that a gene is differentially 
# expressed (DE) for ALL genes in the table. It outputs both a 
# nominal and a multiple hypothesis corrected p-value (padj). 
# Because we are testing DE for over 20,000 genes, we do need to correct 
# for multiple hypothesis testing. To find genes that are significantly 
# DE select the ones has lower padj values. # higher fold changes and 
# visualize them on our scatter plot with different  # color. 
# padj values are corrected p-values which are multiplied by the number 
# of comparisons. Here we are going to use 0.01 for padj value 
# and > 1 log2foldchange.  (1/2 < foldChange < 2)

de_res <- runDESeq(data, columns, conds,  padj=0.01, log2FoldChange=1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
overlaid_data <- overlaySig(gdat, de_res$res_selected)

#7. Make ggplot

ggplot() +
  geom_point(data=overlaid_data, aes_string(x="Treat", y="Control",
                                    colour="Legend"), alpha=6/10, size=3) +
  scale_colour_manual(values=c("All"="darkgrey","Significant"="red"))+
  scale_x_log10() +scale_y_log10()

#8. The Second way to visualize it we use MA plots.
# For MA Plot there is another builtin function that you can use.
plotMA(de_res$res_detected,ylim=c(-2,2),main="DESeq2");

#9. The third way of visualizing the data is making a Volcano Plot.
# Here on the x axis you have log2foldChange values and y axis you 
# have your -log10 padj values. To see how significant genes are 
# distributed. Highlight genes that have an absolute fold change > 2 
# and a padj < 0.01

volcanoPlot(de_res,  padj=0.01, log2FoldChange=1)
## Warning: Removed 1410 rows containing missing values (geom_point).

#10. The forth way of visualizing the data that is widely used in this 
# type of analysis is clustering and Heatmaps.
# Here we usually add a pseudocount value 0.1 in this case.

ld <- log2(data[rownames(de_res$res_selected),]+0.1)
# Scaling the value using their mean centers can be good it the data is 
# uniformely distributed in all the samples.

cldt <- scale(t(ld), center=TRUE, scale=TRUE);
cld <- t(cldt)

# We can define different distance methods to calculate the distance
# between samples. Here we focus on euclidean distance and correlation
#a. Euclidean distance

distance<-dist(cldt, method = "euclidean")

# To plot only the cluster you can use the command below
plot(hclust(distance, method = "complete"),
     main="Euclidean", xlab="")

#The heatmap

heatmap.2(cld, Rowv=TRUE,dendrogram="column",
          Colv=TRUE, col=redblue(256),labRow=NA,
          density.info="none",trace="none", cexCol=0.8);

#b. Correlation between libraries
#Here we calculate the correlation between samples.
dissimilarity <- 1 - cor(cld)
# We define it as distance. This will create a square matrix that
# will include the dissimilarites between samples.
distance <- as.dist(dissimilarity)

# To plot only the cluster you can use the command below

plot(hclust(distance, method = "complete"),
     main="1-cor", xlab="")

#The heatmap
heatmap.2(cld, Rowv=TRUE,dendrogram="column",
          Colv=TRUE, col=redblue(256),labRow=NA,
          density.info="none",trace="none", cexCol=0.8,
          hclust=function(x) hclust(x,method="complete"),
          distfun=function(x) as.dist((1-cor(t(x)))/2))