########################################################### # # # AprendeBioinformaticaEnCasa (06-04-2020) # # # # https://bioinfogp.cnb.csic.es/courses/quedateencasa # # twitch.tv/bioinfogp # # # ########################################################### # Differential gene expression analysis (RNA-seq) with R # Juan Carlos Oliveros, BioinfoGP, CNB-CSIC # oliveros@cnb.csic.es # "Physiological responses of Saccharomyces cerevisiae to industrially # relevant conditions: Slow growth, low pH, and high CO2 levels" # Hakkaart X, Liu Y, Hulst M, El Masoudi A, Peuscher E, Pronk J, van Gulik W, Daran-Lapujade P. # Biotechnol Bioeng. 2020 Mar;117(3):721-735. doi: 10.1002/bit.27210. Epub 2020 Jan 22. # https://www.ncbi.nlm.nih.gov/pubmed/31654410 # SampleID Description Replicate # SRR9336468 Chemostat pH5 0.04% CO2 R1 # SRR9336469 Chemostat pH5 0.04% CO2 R2 # SRR9336470 Chemostat pH5 0.04% CO2 R3 # SRR9336471 Chemostat pH5 50% CO2 R1 # SRR9336472 Chemostat pH5 50% CO2 R2 # SRR9336473 Chemostat pH5 50% CO2 R3 # SRR9336474 Chemostat pH3 0.04% CO2 R1 # SRR9336475 Chemostat pH3 0.04% CO2 R2 # SRR9336476 Chemostat pH3 0.04% CO2 R3 # original FASTQ files (HUGE!!!) # https://www.ebi.ac.uk/ena/data/view/SRR9336468 # https://www.ebi.ac.uk/ena/data/view/SRR9336469 # https://www.ebi.ac.uk/ena/data/view/SRR9336470 # https://www.ebi.ac.uk/ena/data/view/SRR9336471 # https://www.ebi.ac.uk/ena/data/view/SRR9336472 # https://www.ebi.ac.uk/ena/data/view/SRR9336473 # https://www.ebi.ac.uk/ena/data/view/SRR9336474 # https://www.ebi.ac.uk/ena/data/view/SRR9336475 # https://www.ebi.ac.uk/ena/data/view/SRR9336476 ########################## # # # R commands start here: # # # ########################## # setting working directory: # setwd("F:/olive/Desktop/bioinfogp/") # <- (use always your own path) # Install R packages (from CRAN) # install.packages("R.utils") # <- uncompress gz files # install.packages("BiocManager") # <- Bioconductor manager # Install R packages (from Bioconductor): # BiocManager::install("Rbowtie2") # <- align FASTQ files # BiocManager::install("Rsamtools") # <- manipulate SAM files # BiocManager::install("Rsubread") # <- quantify alignments # BiocManager::install("DESeq2") # <- differential gene expression # load all libraries: library("R.utils") library("Rbowtie2") library("Rsamtools") library("Rsubread") library("DESeq2") # STEP 1: Uncompress all gz files with "gunzip": gunzip("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz", remove=FALSE) gunzip("fastq/1M_SRR9336468_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336468_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336469_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336469_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336470_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336470_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336471_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336471_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336472_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336472_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336473_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336473_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336474_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336474_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336475_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336475_2.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336476_1.fastq.gz", remove = FALSE) gunzip("fastq/1M_SRR9336476_2.fastq.gz", remove = FALSE) gunzip("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3.gz", remove = FALSE) # STEP 2: Create genome index for bowtie2 with "bowtie2_build": bowtie2_build("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", bt2Index = "chromosomes/yeast") # STEP 3: Align FASTQ files against indexed genome with "bowtie2": dir.create("bam") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M68_pH5_0.04CO2_R1.sam", seq1 = "fastq/1M_SRR9336468_1.fastq", seq2 = "fastq/1M_SRR9336468_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M69_pH5_0.04CO2_R2.sam", seq1 = "fastq/1M_SRR9336469_1.fastq", seq2 = "fastq/1M_SRR9336469_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M70_pH5_0.04CO2_R3.sam", seq1 = "fastq/1M_SRR9336470_1.fastq", seq2 = "fastq/1M_SRR9336470_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M71_pH5_50CO2_R1.sam", seq1 = "fastq/1M_SRR9336471_1.fastq", seq2 = "fastq/1M_SRR9336471_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M72_pH5_50CO2_R2.sam", seq1 = "fastq/1M_SRR9336472_1.fastq", seq2 = "fastq/1M_SRR9336472_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M73_pH5_50CO2_R3.sam", seq1 = "fastq/1M_SRR9336473_1.fastq", seq2 = "fastq/1M_SRR9336473_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M74_pH3_0.04CO2_R1.sam", seq1 = "fastq/1M_SRR9336474_1.fastq", seq2 = "fastq/1M_SRR9336474_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M75_pH3_0.04CO2_R2.sam", seq1 = "fastq/1M_SRR9336475_1.fastq", seq2 = "fastq/1M_SRR9336475_2.fastq", "--threads=3") bowtie2(bt2Index = "chromosomes/yeast", samOutput = "bam/1M76_pH3_0.04CO2_R3.sam", seq1 = "fastq/1M_SRR9336476_1.fastq", seq2 = "fastq/1M_SRR9336476_2.fastq", "--threads=3") # STEP 4: Convert SAM files into BAM (and indexes .bai) with "asBam" asBam("bam/1M68_pH5_0.04CO2_R1.sam") asBam("bam/1M69_pH5_0.04CO2_R2.sam") asBam("bam/1M70_pH5_0.04CO2_R3.sam") asBam("bam/1M71_pH5_50CO2_R1.sam") asBam("bam/1M72_pH5_50CO2_R2.sam") asBam("bam/1M73_pH5_50CO2_R3.sam") asBam("bam/1M74_pH3_0.04CO2_R1.sam") asBam("bam/1M75_pH3_0.04CO2_R2.sam") asBam("bam/1M76_pH3_0.04CO2_R3.sam") # [OPTIONAL] To check the content of gff3 file: # file.show("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3") # STEP 5: store all paths of bam files into "bamFiles" R object (use "list.files"): bamFiles <- list.files(path="bam", pattern="*.bam$", full.names=TRUE) # STEP 6: quantify "genes" with "featureCounts" and store results in R object "data" data <- featureCounts( files = bamFiles, annot.ext = "genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3", isGTFAnnotation = TRUE, GTF.featureType = "gene", GTF.attrType = "ID", isPairedEnd = TRUE, requireBothEndsMapped = TRUE, minMQS = 20, strandSpecific = 2 ) # STEP 7: Create a data.frame to store all replicate groups. IN THE SAME ORDER AS "bamFiles": thesamples <- data.frame(conditions=c( "p5c0.04", "p5c0.04", "p5c0.04", "p5c50", "p5c50", "p5c50", "p3c0.04", "p3c0.04", "p3c0.04" )) # STEP 8: Use "DESeqDataSetFromMatrix" to repare all raw counts and experimental design: dds <- DESeqDataSetFromMatrix(data$counts, colData=thesamples, design = ~ conditions) # STEP 9: Use "DESeq" function to normalize and calculate dispersions of gene expression data: dds <- DESeq(dds) # STEP 10: Use "results" function to get log2Ratios, pvalues... for comparison of interest: P5C50vsP5C004 <- results(dds, contrast=c("conditions", "p5c50", "p5c0.04")) # NOTE: contrast = [column name in "thesamples" dataframe], [numerator], [denominator] # STEP 11: Prepare a pretty table (data.frame) with all results: FINAL_P5C50vsP5C004 <- data.frame( GENEID = row.names(P5C50vsP5C004), log2BaseMean = log2(P5C50vsP5C004$baseMean), log2Ratio = P5C50vsP5C004$log2FoldChange, STDERR_log2Ratio = P5C50vsP5C004$lfcSE, pvalue = P5C50vsP5C004$pvalue, padjust = P5C50vsP5C004$padj ) # STEP 12: Use "write.table" to create a text-tabulated file: write.table(FINAL_P5C50vsP5C004, file="RESULTS_P5C50vsP5C004.tsv", sep="\t", row.names=FALSE) # Upload text-tabulated file into FIESTA viewer to create an interactive MA plot: # # https://bioinfogp.cnb.csic.es/tools/FIESTA/index_server.php # ############################################### # # # Gene Identifier : GENEID # # # # X axis (horizontal) : log2BaseMean # # # # Y axis (vertical) : log2Ratio # # # # Error Y (vertical) : STDERR_log2Ratio # # # # Spot Colors : MA plot # # # # Press [Next] # # # ############################################### # Wait few seconds and download the ".zip" file with all results. # Uncompress ".zip" folder in your hard disk an open "index.html" with any browser. # # Filter (eg. "log2Ratio>0" and "padjust < 0.05"), # sort results by padjust and save the table # # PROPOSED TASKS: # ============== # 1. Filter and save down-regulated genes in comparison P5C50vsP5C004 # # 2. Get up-regulated an down-regulated genes for comparison: "pH=3 CO2=0.04%" versus "pH=5 CO2=0.04%" # # 3. Play around with different filtering thresholds (padjust < 0.1; padjust <0.01; others...) # # 4. (DIFFICULT): Create a text-tabulated file with raw counts. Tip: assays(dds)$counts # # 5. (DIFFICULT): Create a text-tabulated file with normalized counts. Tip: assays(dds)$mu # NOTE: It is illustrative to look at normalized counts of differentially # expressed genes with good pvalues. # # 6. (DIFFICULT): add columns with normalized counts to the FINAL table (create file). # # Useful links: # ============= # Illumina Sequencing Technology (Many thanks to Victoria Castro Illana!) # https://www.youtube.com/watch?v=womKfikWlxM&frags=pl%2Cwn # # All about DESeq2 bioconductor package: # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html # # About MA plots (wikipedia): # https://en.wikipedia.org/wiki/MA_plot # # padjust (FDR calculation) clearly explained!! # https://statquest.org/tag/fdr/ ######################################################################## # # strandSpecific = 0 means "no strand specific" # strandSpecific = 1 means "fragment _1 is direct strand" # strandSpecific = 2 means "fragment _1 is reverse strand" # # mRNA: 5'-AUGCGUCGACGUUUAGCAUGUCGAUGCUGAUUGCGAUCGAUGCCGACAAGAUAGUAA-3' # # (retrotranscription and second strand generation) # # cDNA: 5'-ATGCGTCGACGTTTAGCATGTCGATGCTGATTGCGATCGATGCCGACAAGATAGTAA-3' # ||||||||||||||||||||||||||||||||||||||||||||||||||||||||| # 3'-tacgcagctgcaaatcgtacagctacgactaacgctagctacggctgttctatcatt-5' # <-------------- # (note the polarity) # # # strandSpecific=0: reads_?.fastq: reads_?.fastq: # 5'-ATGCGTCGACGTTTA-3' 5'-ttactatcttgtcgg-3' # --------------> # # strandSpecific=1: reads_1.fastq: reads_2.fastq: # 5'-ATGCGTCGACGTTTA-3' 5'-ttactatcttgtcgg-3' # --------------> # # strandSpecific=2: reads_1.fastq: reads_2.fastq: # 5'-ttactatcttgtcgg-3' 5'-ATGCGTCGACGTTTA-3' # --------------> # ########################################################################