#Tip: To run any line in RStudio you can use Ctrl+Enter or Command+Enter. #To use any package in R, we need to load its library first.To run DESeq2 #you just need to type the following. library("DESeq2") source("/project/umw_biocore/class/funcs.R") #1. Reading the data. # We already made gene quantifications and merged expected counts into # a single table. Here we prepared another table using whole reads # (not only using reduced reads) to give you an idea about the complete # picture of DESeq analysis. file <- "/project/umw_biocore/class/data.tsv" #1. Read the data with row.names. We have the gene names in the # first column in this table. When we set row.names=1 It will remove # the first column from the data and put them to the row name section. rsem <- read.table(file,sep="\t", header=TRUE, row.names=1) #2. Creating the data structure for DESeq Analysis, we need to select # the columns that we are going to use in the analysis. Here we are # going to use experiment and control data that will be 6 columns. # Here in this step make sure to include the columns you want to use in # your analysis. columns <- c("exper_rep1","exper_rep2","exper_rep3", "control_rep1","control_rep2","control_rep3") data <- data.frame(rsem[, columns]) #3. Scatter plots are a key approach to assess differences between conditions. # We plot the average expression value (counts or TPMs for example) of # two conditions.First calculate the average reads per gene. To find the # average we sum experiments per gene and divide it to the number of replicas # In this case we will divide the sum to 3. We will calculate the average for # control libraries too. After that, we are ready to merge the average values # from experiment and control to create a two column data structure using the # cbind function that merges tables. avgall<-cbind(rowSums(data[c("exper_rep1","exper_rep2","exper_rep3")])/3, rowSums(data[c("control_rep1","control_rep2","control_rep3")])/3) # We can also change the column names using colnames function. colnames(avgall)<-c("Treat", "Control") #4.There are several packages in R that can be used to # generate scatter plots. The same plot can be generated using # the ggplot package. gdat<-data.frame(avgall) ggplot() + geom_point(data=gdat, aes_string(x="Treat", y="Control"), colour="black", alpha=6/10, size=3) + scale_x_log10() +scale_y_log10() #5. In Eukaryotes only a subset of all genes are expressed in # a given cell. Expression is therefore a bimodal distribution, # with non-expressed genes having counts that result from experimental # and biological noise. It is important to filter out the genes # that are not expressed before doing differential gene expression. # You can decide which cutoff separates expressed vs non-expressed # genes by looking your histogram we created. # In our case a total sum of 10 counts separates well expressed # from non-expressed genes sumd <- rowSums(data) hist(log10(sumd), breaks=100) abline(v=1) #To make all to all scater plot, use the function below all2all(data) ###################################### ## LETS START DESeq ANALYSIS ###################################### # #6. The goal of Differential gene expression analysis is to find # genes or transcripts whose difference in expression, when accounting # for the variance within condition, is higher than expected by chance. # The first step is to indicate the condition that each column (experiment) # in the table represent. # Here we define the correspondence between columns and conditions. # Make sure the order of the columns matches to your table. columns conds <- factor( c("Control","Control", "Control", "Treat", "Treat","Treat") ) # DESeq will compute the probability that a gene is differentially # expressed (DE) for ALL genes in the table. It outputs both a # nominal and a multiple hypothesis corrected p-value (padj). # Because we are testing DE for over 20,000 genes, we do need to correct # for multiple hypothesis testing. To find genes that are significantly # DE, we select the ones has lower padj values and higher fold changes and # visualize them on our scatter plot with different color. # padj values are corrected p-values which are multiplied by the number # of comparisons. Here we are going to use 0.01 for padj value # and > 1 log2foldchange. (1/2 < foldChange < 2) de_res <- runDESeq(data, columns, conds, padj=0.01, log2FoldChange=1, non_expressed_cutoff=10) overlaid_data <- overlaySig(gdat, de_res$res_selected) #7. Make ggplot and overlay the significantly DE genes in the scatter plot ggplot() + geom_point(data=overlaid_data, aes_string(x="Treat", y="Control", colour="Legend"), alpha=6/10, size=3) + scale_colour_manual(values=c("All"="darkgrey","Significant"="red"))+ scale_x_log10() +scale_y_log10() #8. The Second way to visualize it, we use MA plots. # For MA Plot there is another builtin function that you can use. plotMA(de_res$res_detected,ylim=c(-2,2),main="DESeq2"); #9. The third way of visualizing the data is making a Volcano Plot. # Here on the x axis you have log2foldChange values and y axis you # have your -log10 padj values. To see how significant genes are # distributed. Highlight genes that have an absolute fold change > 2 # and a padj < 0.01 volcanoPlot(de_res, padj=0.01, log2FoldChange=1) #10. The forth way of visualizing the data that is widely used in this # type of analysis is clustering and Heatmaps. sel_data<-data[rownames(de_res$res_selected),] #a. Normalizing the selected data using TMM norm_data<-getNormalizedMatrix(sel_data, method="TMM") # Here we usually add a pseudocount value 0.1 in this case. ld <- log2(norm_data+0.1) #Alternative: Use the selected data without normalization and see the difference in the heatmap # (Hint: skip the line below to use normalized data) ld <- log2(sel_data+0.1) # b. Scaling the value using their mean centers can be good it the data is # uniformely distributed in all the samples. cldt <- scale(t(ld), center=TRUE, scale=TRUE); # We can define different distance methods to calculate the distance # between samples. Here we focus on euclidean distance and correlation #c. Euclidean distance distance<-dist(cldt, method = "euclidean") # To plot only the cluster you can use the command below plot(hclust(distance, method = "complete"), main="Euclidean", xlab="") #The heatmap cld <- t(cldt) heatmap.2(cld, Rowv=TRUE,dendrogram="column", Colv=TRUE, col=redblue(256),labRow=NA, density.info="none",trace="none", cexCol=0.8); #d. Correlation between libraries #Here we calculate the correlation between samples. dissimilarity <- 1 - cor(cld) # We define it as distance. This will create a square matrix that # will include the dissimilarites between samples. distance <- as.dist(dissimilarity) # To plot only the cluster you can use the command below plot(hclust(distance, method = "complete"), main="1-cor", xlab="") #The heatmap heatmap.2(cld, Rowv=TRUE,dendrogram="column", Colv=TRUE, col=redblue(256),labRow=NA, density.info="none",trace="none", cexCol=0.8, hclust=function(x) hclust(x,method="complete"), distfun=function(x) as.dist((1-cor(t(x)))/2))