###########################################################
#                                                         #
#   AprendeBioinformaticaEnCasa (06-04-2020)              #
#                                                         #
#   https://bioinfogp.cnb.csic.es/courses/quedateencasa   #
#   twitch.tv/bioinfogp                                   #
#                                                         #
###########################################################

# Differential gene expression analysis (RNA-seq) with R

# Juan Carlos Oliveros, BioinfoGP, CNB-CSIC
# oliveros@cnb.csic.es


# "Physiological responses of Saccharomyces cerevisiae to industrially
# relevant conditions: Slow growth, low pH, and high CO2 levels"
# Hakkaart X, Liu Y, Hulst M, El Masoudi A, Peuscher E, Pronk J, van Gulik W, Daran-Lapujade P.
# Biotechnol Bioeng. 2020 Mar;117(3):721-735. doi: 10.1002/bit.27210. Epub 2020 Jan 22.

# https://www.ncbi.nlm.nih.gov/pubmed/31654410

# SampleID      Description               Replicate
# SRR9336468    Chemostat pH5 0.04% CO2   R1
# SRR9336469    Chemostat pH5 0.04% CO2   R2
# SRR9336470    Chemostat pH5 0.04% CO2   R3
# SRR9336471    Chemostat pH5 50% CO2     R1
# SRR9336472    Chemostat pH5 50% CO2     R2
# SRR9336473    Chemostat pH5 50% CO2     R3
# SRR9336474    Chemostat pH3 0.04% CO2   R1
# SRR9336475    Chemostat pH3 0.04% CO2   R2
# SRR9336476    Chemostat pH3 0.04% CO2   R3

# original FASTQ files (HUGE!!!)
# https://www.ebi.ac.uk/ena/data/view/SRR9336468
# https://www.ebi.ac.uk/ena/data/view/SRR9336469
# https://www.ebi.ac.uk/ena/data/view/SRR9336470
# https://www.ebi.ac.uk/ena/data/view/SRR9336471
# https://www.ebi.ac.uk/ena/data/view/SRR9336472
# https://www.ebi.ac.uk/ena/data/view/SRR9336473
# https://www.ebi.ac.uk/ena/data/view/SRR9336474
# https://www.ebi.ac.uk/ena/data/view/SRR9336475
# https://www.ebi.ac.uk/ena/data/view/SRR9336476

##########################
#                        #
# R commands start here: #
#                        #
##########################

# setting working directory:
# setwd("F:/olive/Desktop/bioinfogp/")  # <- (use always your own path)

# Install R packages (from CRAN)

# install.packages("R.utils")     # <- uncompress gz files
# install.packages("BiocManager") # <- Bioconductor manager

# Install R packages (from Bioconductor):

# BiocManager::install("Rbowtie2")  # <- align FASTQ files
# BiocManager::install("Rsamtools") # <- manipulate SAM files
# BiocManager::install("Rsubread")  # <- quantify alignments
# BiocManager::install("DESeq2")    # <- differential gene expression

# load all libraries:

library("R.utils")
library("Rbowtie2")
library("Rsamtools")
library("Rsubread")
library("DESeq2")

# STEP 1: Uncompress all gz files with "gunzip":

gunzip("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz", remove=FALSE)

gunzip("fastq/1M_SRR9336468_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336468_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336469_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336469_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336470_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336470_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336471_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336471_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336472_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336472_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336473_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336473_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336474_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336474_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336475_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336475_2.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336476_1.fastq.gz", remove = FALSE)
gunzip("fastq/1M_SRR9336476_2.fastq.gz", remove = FALSE)

gunzip("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3.gz", remove = FALSE)

# STEP 2: Create genome index for bowtie2 with "bowtie2_build":

bowtie2_build("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", bt2Index = "chromosomes/yeast")

# STEP 3: Align FASTQ files against indexed genome with "bowtie2":

dir.create("bam")

bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M68_pH5_0.04CO2_R1.sam", seq1 = "fastq/1M_SRR9336468_1.fastq", seq2 = "fastq/1M_SRR9336468_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M69_pH5_0.04CO2_R2.sam", seq1 = "fastq/1M_SRR9336469_1.fastq", seq2 = "fastq/1M_SRR9336469_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M70_pH5_0.04CO2_R3.sam", seq1 = "fastq/1M_SRR9336470_1.fastq", seq2 = "fastq/1M_SRR9336470_2.fastq", "--threads=3")       

bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M71_pH5_50CO2_R1.sam", seq1 = "fastq/1M_SRR9336471_1.fastq", seq2 = "fastq/1M_SRR9336471_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M72_pH5_50CO2_R2.sam", seq1 = "fastq/1M_SRR9336472_1.fastq", seq2 = "fastq/1M_SRR9336472_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M73_pH5_50CO2_R3.sam", seq1 = "fastq/1M_SRR9336473_1.fastq", seq2 = "fastq/1M_SRR9336473_2.fastq", "--threads=3")       

bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M74_pH3_0.04CO2_R1.sam", seq1 = "fastq/1M_SRR9336474_1.fastq", seq2 = "fastq/1M_SRR9336474_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M75_pH3_0.04CO2_R2.sam", seq1 = "fastq/1M_SRR9336475_1.fastq", seq2 = "fastq/1M_SRR9336475_2.fastq", "--threads=3")       
bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M76_pH3_0.04CO2_R3.sam", seq1 = "fastq/1M_SRR9336476_1.fastq", seq2 = "fastq/1M_SRR9336476_2.fastq", "--threads=3")       

# STEP 4: Convert SAM files into BAM (and indexes .bai) with "asBam"

asBam("bam/1M68_pH5_0.04CO2_R1.sam")
asBam("bam/1M69_pH5_0.04CO2_R2.sam")
asBam("bam/1M70_pH5_0.04CO2_R3.sam")

asBam("bam/1M71_pH5_50CO2_R1.sam")
asBam("bam/1M72_pH5_50CO2_R2.sam")
asBam("bam/1M73_pH5_50CO2_R3.sam")

asBam("bam/1M74_pH3_0.04CO2_R1.sam")
asBam("bam/1M75_pH3_0.04CO2_R2.sam")
asBam("bam/1M76_pH3_0.04CO2_R3.sam")

# [OPTIONAL] To check the content of gff3 file:
# file.show("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3")

# STEP 5: store all paths of bam files into "bamFiles" R object (use "list.files"):
bamFiles <- list.files(path="bam", pattern="*.bam$", full.names=TRUE)

# STEP 6: quantify "genes" with "featureCounts" and store results in R object "data"
data <- featureCounts(
                     files = bamFiles,
                 annot.ext = "genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3",
           isGTFAnnotation = TRUE,
           GTF.featureType = "gene",
              GTF.attrType = "ID",
               isPairedEnd = TRUE,
     requireBothEndsMapped = TRUE,
                    minMQS = 20,
            strandSpecific = 2
        )

# STEP 7: Create a data.frame to store all replicate groups. IN THE SAME ORDER AS "bamFiles":
thesamples <- data.frame(conditions=c(
                                   "p5c0.04", "p5c0.04", "p5c0.04",
                                   "p5c50",   "p5c50",   "p5c50",
                                   "p3c0.04", "p3c0.04", "p3c0.04"
                                   ))
# STEP 8: Use "DESeqDataSetFromMatrix" to repare all raw counts and experimental design:
dds <- DESeqDataSetFromMatrix(data$counts, colData=thesamples, design = ~ conditions)

# STEP 9: Use "DESeq" function to normalize and calculate dispersions of gene expression data:
dds <- DESeq(dds)

# STEP 10: Use "results" function to get log2Ratios, pvalues... for comparison of interest:
P5C50vsP5C004 <- results(dds, contrast=c("conditions", "p5c50", "p5c0.04"))

#    NOTE: contrast = [column name in "thesamples" dataframe], [numerator], [denominator]

# STEP 11: Prepare a pretty table (data.frame) with all results:
FINAL_P5C50vsP5C004 <- data.frame(
                                     GENEID = row.names(P5C50vsP5C004),
                               log2BaseMean = log2(P5C50vsP5C004$baseMean),
                                  log2Ratio = P5C50vsP5C004$log2FoldChange,
                           STDERR_log2Ratio = P5C50vsP5C004$lfcSE,
                                     pvalue = P5C50vsP5C004$pvalue,
                                    padjust = P5C50vsP5C004$padj
                                     )

# STEP 12: Use "write.table" to create a text-tabulated file:
write.table(FINAL_P5C50vsP5C004, file="RESULTS_P5C50vsP5C004.tsv", sep="\t", row.names=FALSE)

# Upload text-tabulated file into FIESTA viewer to create an interactive MA plot:
#
# https://bioinfogp.cnb.csic.es/tools/FIESTA/index_server.php
#
###############################################
#                                             #
# Gene Identifier     : GENEID                #
#                                             #
# X axis (horizontal) : log2BaseMean          #
#                                             #
# Y axis (vertical)   : log2Ratio             #
#                                             #
# Error Y (vertical)  : STDERR_log2Ratio      #
#                                             #
# Spot Colors         : MA plot               #
#                                             #
# Press [Next]                                #
#                                             #
###############################################

# Wait few seconds and download the ".zip" file with all results.
# Uncompress ".zip" folder in your hard disk an open "index.html" with any browser.
#
# Filter (eg. "log2Ratio>0" and "padjust < 0.05"),
# sort results by padjust and save the table
#
# PROPOSED TASKS: 
# ==============
# 1. Filter and save down-regulated genes in comparison P5C50vsP5C004
#
# 2. Get up-regulated an down-regulated genes for comparison: "pH=3 CO2=0.04%" versus "pH=5 CO2=0.04%"
#
# 3. Play around with different filtering thresholds (padjust < 0.1; padjust <0.01; others...)
#
# 4. (DIFFICULT): Create a text-tabulated file with raw counts. Tip: assays(dds)$counts
#
# 5. (DIFFICULT): Create a text-tabulated file with normalized counts. Tip: assays(dds)$mu
#           NOTE: It is illustrative to look at normalized counts of differentially
#               expressed genes with good pvalues.
#
# 6. (DIFFICULT): add columns with normalized counts to the FINAL table (create file).
#
# Useful links:
# =============
# Illumina Sequencing Technology (Many thanks to Victoria Castro Illana!)
# https://www.youtube.com/watch?v=womKfikWlxM&frags=pl%2Cwn
#
# All about DESeq2 bioconductor package:
# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
#
# About MA plots (wikipedia):
# https://en.wikipedia.org/wiki/MA_plot
#
# padjust (FDR calculation) clearly explained!!
# https://statquest.org/tag/fdr/


########################################################################
#
# strandSpecific = 0 means "no strand specific"
# strandSpecific = 1 means "fragment _1 is direct strand"
# strandSpecific = 2 means "fragment _1 is reverse strand"
#
# mRNA: 5'-AUGCGUCGACGUUUAGCAUGUCGAUGCUGAUUGCGAUCGAUGCCGACAAGAUAGUAA-3'
# 
#              (retrotranscription and second strand generation)
# 
# cDNA: 5'-ATGCGTCGACGTTTAGCATGTCGATGCTGATTGCGATCGATGCCGACAAGATAGTAA-3'
#          |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#       3'-tacgcagctgcaaatcgtacagctacgactaacgctagctacggctgttctatcatt-5'
#                                                    <-------------- 
#                                                  (note the polarity)     
#
#
# strandSpecific=0:  reads_?.fastq:              reads_?.fastq:
#                    5'-ATGCGTCGACGTTTA-3'       5'-ttactatcttgtcgg-3'                   
#                                                   -------------->
#                                                                                                                                                     
# strandSpecific=1:  reads_1.fastq:              reads_2.fastq:
#                    5'-ATGCGTCGACGTTTA-3'       5'-ttactatcttgtcgg-3'                   
#                                                   -------------->
#                                                                                                                     
# strandSpecific=2:  reads_1.fastq:              reads_2.fastq:
#                    5'-ttactatcttgtcgg-3'       5'-ATGCGTCGACGTTTA-3'                   
#                       -------------->
#
########################################################################