########################################################### # # # AprendeBioinformaticaEnCasa (06-04-2020) # # # # https://bioinfogp.cnb.csic.es/courses/quedateencasa # # twitch.tv/bioinfogp # # # ########################################################### # Differential gene expression analysis (RNA-seq) with R # Juan Carlos Oliveros, BioinfoGP, CNB-CSIC # oliveros@cnb.csic.es # "Physiological responses of Saccharomyces cerevisiae to industrially # relevant conditions: Slow growth, low pH, and high CO2 levels" # Hakkaart X, Liu Y, Hulst M, El Masoudi A, Peuscher E, Pronk J, van Gulik W, Daran-Lapujade P. # Biotechnol Bioeng. 2020 Mar;117(3):721-735. doi: 10.1002/bit.27210. Epub 2020 Jan 22. # https://www.ncbi.nlm.nih.gov/pubmed/31654410 # SampleID Description Replicate # SRR9336468 Chemostat pH5 0.04% CO2 R1 # SRR9336469 Chemostat pH5 0.04% CO2 R2 # SRR9336470 Chemostat pH5 0.04% CO2 R3 # SRR9336471 Chemostat pH5 50% CO2 R1 # SRR9336472 Chemostat pH5 50% CO2 R2 # SRR9336473 Chemostat pH5 50% CO2 R3 # SRR9336474 Chemostat pH3 0.04% CO2 R1 # SRR9336475 Chemostat pH3 0.04% CO2 R2 # SRR9336476 Chemostat pH3 0.04% CO2 R3 # original FASTQ files (HUGE!!!) # https://www.ebi.ac.uk/ena/data/view/SRR9336468 # https://www.ebi.ac.uk/ena/data/view/SRR9336469 # https://www.ebi.ac.uk/ena/data/view/SRR9336470 # https://www.ebi.ac.uk/ena/data/view/SRR9336471 # https://www.ebi.ac.uk/ena/data/view/SRR9336472 # https://www.ebi.ac.uk/ena/data/view/SRR9336473 # https://www.ebi.ac.uk/ena/data/view/SRR9336474 # https://www.ebi.ac.uk/ena/data/view/SRR9336475 # https://www.ebi.ac.uk/ena/data/view/SRR9336476 ########################## # # # R commands start here: # # # ########################## # setting working directory: # setwd("F:/olive/Desktop/bioinfogp/") # <- (use always your own path) # Install R packages (from CRAN) # install.packages("R.utils") # <- uncompress gz files # install.packages("BiocManager") # <- Bioconductor manager # Install R packages (from Bioconductor): # BiocManager::install("Rbowtie2") # <- align FASTQ files # BiocManager::install("Rsamtools") # <- manipulate SAM files # BiocManager::install("Rsubread") # <- quantify alignments # BiocManager::install("DESeq2") # <- differential gene expression # load all libraries: library("R.utils") library("Rbowtie2") library("Rsamtools") library("Rsubread") library("DESeq2") # STEP 1: Uncompress all gz files with "gunzip": # STEP 2: Create genome index for bowtie2 with "bowtie2_build": # STEP 3: Align FASTQ files against indexed genome with "bowtie2": # STEP 4: Convert SAM files into BAM (and indexes .bai) with "asBam" # [OPTIONAL] To check the content of gff3 file: # file.show("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3") # STEP 5: store all paths of bam files into "bamFiles" R object (use "list.files"): # STEP 6: quantify "genes" with "featureCounts" and store results in R object "data" # STEP 7: Create a data.frame to store all replicate groups. IN THE SAME ORDER AS "bamFiles": # STEP 8: Use "DESeqDataSetFromMatrix" to repare all raw counts and experimental design: # STEP 9: Use "DESeq" function to normalize and calculate dispersions of gene expression data: # STEP 10: Use "results" function to get log2Ratios, pvalues... for comparison of interest: # NOTE: contrast = [column name in "thesamples" dataframe], [numerator], [denominator] # STEP 11: Prepare a pretty table (data.frame) with all results: # STEP 12: Use "write.table" to create a text-tabulated file: # Upload text-tabulated file into FIESTA viewer to create an interactive MA plot: # # https://bioinfogp.cnb.csic.es/tools/FIESTA/index_server.php # ############################################### # # # Gene Identifier : GENEID # # # # X axis (horizontal) : log2BaseMean # # # # Y axis (vertical) : log2Ratio # # # # Error Y (vertical) : STDERR_log2Ratio # # # # Spot Colors : MA plot # # # # Press [Next] # # # ############################################### # Wait few seconds and download the ".zip" file with all results. # Uncompress ".zip" folder in your hard disk an open "index.html" with any browser. # # Filter (eg. "log2Ratio>0" and "padjust < 0.05"), # sort results by padjust and save the table # # PROPOSED TASKS: # ============== # 1. Filter and save down-regulated genes in comparison P5C50vsP5C004 # # 2. Get up-regulated an down-regulated genes for comparison: "pH=3 CO2=0.04%" versus "pH=5 CO2=0.04%" # # 3. Play around with different filtering thresholds (padjust < 0.1; padjust <0.01; others...) # # 4. (DIFFICULT): Create a text-tabulated file with raw counts. Tip: assays(dds)$counts # # 5. (DIFFICULT): Create a text-tabulated file with normalized counts. Tip: assays(dds)$mu # NOTE: It is illustrative to look at normalized counts of differentially # expressed genes with good pvalues. # # 6. (DIFFICULT): add columns with normalized counts to the FINAL table (create file). # # Useful links: # ============= # Illumina Sequencing Technology (Many thanks to Victoria Castro Illana!) # https://www.youtube.com/watch?v=womKfikWlxM&frags=pl%2Cwn # # All about DESeq2 bioconductor package: # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html # # About MA plots (wikipedia): # https://en.wikipedia.org/wiki/MA_plot # # padjust (FDR calculation) clearly explained!! # https://statquest.org/tag/fdr/ ######################################################################## # # strandSpecific = 0 means "no strand specific" # strandSpecific = 1 means "fragment _1 is direct strand" # strandSpecific = 2 means "fragment _1 is reverse strand" # # mRNA: 5'-AUGCGUCGACGUUUAGCAUGUCGAUGCUGAUUGCGAUCGAUGCCGACAAGAUAGUAA-3' # # (retrotranscription and second strand generation) # # cDNA: 5'-ATGCGTCGACGTTTAGCATGTCGATGCTGATTGCGATCGATGCCGACAAGATAGTAA-3' # ||||||||||||||||||||||||||||||||||||||||||||||||||||||||| # 3'-tacgcagctgcaaatcgtacagctacgactaacgctagctacggctgttctatcatt-5' # <-------------- # (note the polarity) # # # strandSpecific=0: reads_?.fastq: reads_?.fastq: # 5'-ATGCGTCGACGTTTA-3' 5'-ttactatcttgtcgg-3' # --------------> # # strandSpecific=1: reads_1.fastq: reads_2.fastq: # 5'-ATGCGTCGACGTTTA-3' 5'-ttactatcttgtcgg-3' # --------------> # # strandSpecific=2: reads_1.fastq: reads_2.fastq: # 5'-ttactatcttgtcgg-3' 5'-ATGCGTCGACGTTTA-3' # --------------> # ########################################################################