###########################################################
#                                                         #
#   AprendeBioinformaticaEnCasa (06-04-2020)              #
#                                                         #
#   https://bioinfogp.cnb.csic.es/courses/quedateencasa   #
#   twitch.tv/bioinfogp                                   #
#                                                         #
###########################################################

# Differential gene expression analysis (RNA-seq) with R

# Juan Carlos Oliveros, BioinfoGP, CNB-CSIC
# oliveros@cnb.csic.es


# "Physiological responses of Saccharomyces cerevisiae to industrially
# relevant conditions: Slow growth, low pH, and high CO2 levels"
# Hakkaart X, Liu Y, Hulst M, El Masoudi A, Peuscher E, Pronk J, van Gulik W, Daran-Lapujade P.
# Biotechnol Bioeng. 2020 Mar;117(3):721-735. doi: 10.1002/bit.27210. Epub 2020 Jan 22.

# https://www.ncbi.nlm.nih.gov/pubmed/31654410

# SampleID      Description               Replicate
# SRR9336468    Chemostat pH5 0.04% CO2   R1
# SRR9336469    Chemostat pH5 0.04% CO2   R2
# SRR9336470    Chemostat pH5 0.04% CO2   R3
# SRR9336471    Chemostat pH5 50% CO2     R1
# SRR9336472    Chemostat pH5 50% CO2     R2
# SRR9336473    Chemostat pH5 50% CO2     R3
# SRR9336474    Chemostat pH3 0.04% CO2   R1
# SRR9336475    Chemostat pH3 0.04% CO2   R2
# SRR9336476    Chemostat pH3 0.04% CO2   R3

# original FASTQ files (HUGE!!!)
# https://www.ebi.ac.uk/ena/data/view/SRR9336468
# https://www.ebi.ac.uk/ena/data/view/SRR9336469
# https://www.ebi.ac.uk/ena/data/view/SRR9336470
# https://www.ebi.ac.uk/ena/data/view/SRR9336471
# https://www.ebi.ac.uk/ena/data/view/SRR9336472
# https://www.ebi.ac.uk/ena/data/view/SRR9336473
# https://www.ebi.ac.uk/ena/data/view/SRR9336474
# https://www.ebi.ac.uk/ena/data/view/SRR9336475
# https://www.ebi.ac.uk/ena/data/view/SRR9336476

##########################
#                        #
# R commands start here: #
#                        #
##########################

# setting working directory:
# setwd("F:/olive/Desktop/bioinfogp/")  # <- (use always your own path)

# Install R packages (from CRAN)

# install.packages("R.utils")     # <- uncompress gz files
# install.packages("BiocManager") # <- Bioconductor manager

# Install R packages (from Bioconductor):

# BiocManager::install("Rbowtie2")  # <- align FASTQ files
# BiocManager::install("Rsamtools") # <- manipulate SAM files
# BiocManager::install("Rsubread")  # <- quantify alignments
# BiocManager::install("DESeq2")    # <- differential gene expression

# load all libraries:

library("R.utils")
library("Rbowtie2")
library("Rsamtools")
library("Rsubread")
library("DESeq2")

# STEP 1: Uncompress all gz files with "gunzip":

# STEP 2: Create genome index for bowtie2 with "bowtie2_build":

# STEP 3: Align FASTQ files against indexed genome with "bowtie2":

# STEP 4: Convert SAM files into BAM (and indexes .bai) with "asBam"

# [OPTIONAL] To check the content of gff3 file:
# file.show("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3")

# STEP 5: store all paths of bam files into "bamFiles" R object (use "list.files"):

# STEP 6: quantify "genes" with "featureCounts" and store results in R object "data"

# STEP 7: Create a data.frame to store all replicate groups. IN THE SAME ORDER AS "bamFiles":

# STEP 8: Use "DESeqDataSetFromMatrix" to repare all raw counts and experimental design:

# STEP 9: Use "DESeq" function to normalize and calculate dispersions of gene expression data:

# STEP 10: Use "results" function to get log2Ratios, pvalues... for comparison of interest:

#    NOTE: contrast = [column name in "thesamples" dataframe], [numerator], [denominator]

# STEP 11: Prepare a pretty table (data.frame) with all results:

# STEP 12: Use "write.table" to create a text-tabulated file:

# Upload text-tabulated file into FIESTA viewer to create an interactive MA plot:
#
# https://bioinfogp.cnb.csic.es/tools/FIESTA/index_server.php
#
###############################################
#                                             #
# Gene Identifier     : GENEID                #
#                                             #
# X axis (horizontal) : log2BaseMean          #
#                                             #
# Y axis (vertical)   : log2Ratio             #
#                                             #
# Error Y (vertical)  : STDERR_log2Ratio      #
#                                             #
# Spot Colors         : MA plot               #
#                                             #
# Press [Next]                                #
#                                             #
###############################################

# Wait few seconds and download the ".zip" file with all results.
# Uncompress ".zip" folder in your hard disk an open "index.html" with any browser.
#
# Filter (eg. "log2Ratio>0" and "padjust < 0.05"),
# sort results by padjust and save the table
#
# PROPOSED TASKS: 
# ==============
# 1. Filter and save down-regulated genes in comparison P5C50vsP5C004
#
# 2. Get up-regulated an down-regulated genes for comparison: "pH=3 CO2=0.04%" versus "pH=5 CO2=0.04%"
#
# 3. Play around with different filtering thresholds (padjust < 0.1; padjust <0.01; others...)
#
# 4. (DIFFICULT): Create a text-tabulated file with raw counts. Tip: assays(dds)$counts
#
# 5. (DIFFICULT): Create a text-tabulated file with normalized counts. Tip: assays(dds)$mu
#           NOTE: It is illustrative to look at normalized counts of differentially
#               expressed genes with good pvalues.
#
# 6. (DIFFICULT): add columns with normalized counts to the FINAL table (create file).
#
# Useful links:
# =============
# Illumina Sequencing Technology (Many thanks to Victoria Castro Illana!)
# https://www.youtube.com/watch?v=womKfikWlxM&frags=pl%2Cwn
#
# All about DESeq2 bioconductor package:
# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
#
# About MA plots (wikipedia):
# https://en.wikipedia.org/wiki/MA_plot
#
# padjust (FDR calculation) clearly explained!!
# https://statquest.org/tag/fdr/


########################################################################
#
# strandSpecific = 0 means "no strand specific"
# strandSpecific = 1 means "fragment _1 is direct strand"
# strandSpecific = 2 means "fragment _1 is reverse strand"
#
# mRNA: 5'-AUGCGUCGACGUUUAGCAUGUCGAUGCUGAUUGCGAUCGAUGCCGACAAGAUAGUAA-3'
# 
#              (retrotranscription and second strand generation)
# 
# cDNA: 5'-ATGCGTCGACGTTTAGCATGTCGATGCTGATTGCGATCGATGCCGACAAGATAGTAA-3'
#          |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#       3'-tacgcagctgcaaatcgtacagctacgactaacgctagctacggctgttctatcatt-5'
#                                                    <-------------- 
#                                                  (note the polarity)     
#
#
# strandSpecific=0:  reads_?.fastq:              reads_?.fastq:
#                    5'-ATGCGTCGACGTTTA-3'       5'-ttactatcttgtcgg-3'                   
#                                                   -------------->
#                                                                                                                                                     
# strandSpecific=1:  reads_1.fastq:              reads_2.fastq:
#                    5'-ATGCGTCGACGTTTA-3'       5'-ttactatcttgtcgg-3'                   
#                                                   -------------->
#                                                                                                                     
# strandSpecific=2:  reads_1.fastq:              reads_2.fastq:
#                    5'-ttactatcttgtcgg-3'       5'-ATGCGTCGACGTTTA-3'                   
#                       -------------->
#
########################################################################