########################################################### # # # AprendeBioinformaticaEnCasa (04-05-2020) # # # # https://bioinfogp.cnb.csic.es/courses/quedateencasa # # twitch.tv/bioinfogp # # # ########################################################### # Differential gene expression analysis (RNA-seq) with R # Juan Carlos Oliveros, BioinfoGP, CNB-CSIC # oliveros@cnb.csic.es # "Physiological responses of Saccharomyces cerevisiae to industrially # relevant conditions: Slow growth, low pH, and high CO2 levels" # Hakkaart X, Liu Y, Hulst M, El Masoudi A, Peuscher E, Pronk J, van Gulik W, Daran-Lapujade P. # Biotechnol Bioeng. 2020 Mar;117(3):721-735. doi: 10.1002/bit.27210. Epub 2020 Jan 22. # https://www.ncbi.nlm.nih.gov/pubmed/31654410 # SampleID Description Replicate # SRR9336468 Chemostat pH5 0.04% CO2 R1 # SRR9336469 Chemostat pH5 0.04% CO2 R2 # SRR9336470 Chemostat pH5 0.04% CO2 R3 # SRR9336471 Chemostat pH5 50% CO2 R1 # SRR9336472 Chemostat pH5 50% CO2 R2 # SRR9336473 Chemostat pH5 50% CO2 R3 # SRR9336474 Chemostat pH3 0.04% CO2 R1 # SRR9336475 Chemostat pH3 0.04% CO2 R2 # SRR9336476 Chemostat pH3 0.04% CO2 R3 # original FASTQ files (HUGE!!!) # https://www.ebi.ac.uk/ena/data/view/SRR9336468 # https://www.ebi.ac.uk/ena/data/view/SRR9336469 # https://www.ebi.ac.uk/ena/data/view/SRR9336470 # https://www.ebi.ac.uk/ena/data/view/SRR9336471 # https://www.ebi.ac.uk/ena/data/view/SRR9336472 # https://www.ebi.ac.uk/ena/data/view/SRR9336473 # https://www.ebi.ac.uk/ena/data/view/SRR9336474 # https://www.ebi.ac.uk/ena/data/view/SRR9336475 # https://www.ebi.ac.uk/ena/data/view/SRR9336476 ########################## # # # R commands start here: # # # ########################## setwd("F:/olive/Desktop/bioinfogp/") library("R.utils") library("Rbowtie2") library("Rsamtools") library("Rsubread") library("DESeq2") gunzip("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz", remove=FALSE) gunzip("genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3.gz", remove = FALSE) compressedFastq <- list.files(path="fastq_50k/", pattern="*.fastq.gz$", full.names=TRUE) for (t in 1:NROW(compressedFastq)) { gunzip(compressedFastq[t], remove=FALSE) } allFastq_1 <- list.files(path="fastq_50k/", pattern="*_1.fastq$", full.names=TRUE) allFastq_2 <- list.files(path="fastq_50k/", pattern="*_2.fastq$", full.names=TRUE) PROJECT <- data.frame(fastq_1 = allFastq_1, fastq_2 = allFastq_2) write.table(PROJECT, file="myproject.tsv", sep="\t", row.names=FALSE) PROJECT <- read.table("myproject.tsv", header=TRUE, sep="\t") bowtie2_build("chromosomes/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", bt2Index = "chromosomes/yeast") dir.create("aligned") bamFiles <- vector() # Useful link: data structures in R (vectors, lists, data frames...) # https://rc2e.com/datastructures for(t in 1:NROW(PROJECT)) { myfq1 <- PROJECT$fastq_1[t] myfq2 <- PROJECT$fastq_2[t] mysam <- paste0("aligned/", PROJECT$conditions[t], "-", PROJECT$replicate[t], ".sam") bowtie2(bt2Index = "chromosomes/yeast", samOutput = mysam, seq1 = myfq1, seq2 = myfq2, "--threads=3") asBam(mysam) bamFiles[t] <- paste0("aligned/", PROJECT$conditions[t], "-", PROJECT$replicate[t], ".bam") } # alternative code: # # bamFiles <- vector() # for(t in 1:NROW(PROJECT)) # { # bamFiles[t] <- paste0("aligned/", PROJECT$conditions[t], "-", PROJECT$replicate[t], ".bam") # } ## TASK: mark duplicates in bam files using samtools in UNIX environment ## TIP: use "sam" (already sorted by names) as input file # Notice the parameter "ignoreDup". Get more info about this function # writing ?featureCounts in R console data <- featureCounts( files = bamFiles, annot.ext = "genes/Saccharomyces_cerevisiae.R64-1-1.99.gff3", isGTFAnnotation = TRUE, GTF.featureType = "gene", GTF.attrType = "ID", isPairedEnd = TRUE, requireBothEndsMapped = TRUE, minMQS = 20, strandSpecific = 2, ignoreDup = FALSE, nthreads = 3 ) dds <- DESeqDataSetFromMatrix(data$counts, colData=PROJECT, design = ~ conditions) dds <- DESeq(dds) # sub function is to substitute a subtext by another text in a string or a vector row.names(dds) <- sub("gene:","",row.names(dds)) resP3vsP5 <- results(dds, contrast = c("conditions", "P3C004", "P5C004")) # Desglosed contrast vector (alternative code): # mycontrast <- vector() # mycontrast[1]="conditions" # mycontrast[2]="P3C004" # mycontrast[3]="P5C004" # resP3vsP5 <- results(dds, contrast = mycontrast) # https://www.ensembl.org/index.html finalP3vsP5 <- data.frame( GENEID = row.names(resP3vsP5), log2BaseMeanP3vsP5 = log2(resP3vsP5$baseMean), log2RatioP3vsP5 = resP3vsP5$log2FoldChange, Stderr_log2RatioP3vsP5 = resP3vsP5$lfcSE, pvalue = resP3vsP5$pvalue, padjust = resP3vsP5$padj, assays(dds)$counts ) # use "check.names=FALSE" to avoid header columns to be modified by R annots <- read.table("mart_export.txt", header=TRUE, sep="\t", quote="", check.names=FALSE) # merge function is to add columns from a dataframe to another data frame by using defined columns as "key" finalP3vsP5 <- merge(finalP3vsP5, annots, by.x = "GENEID", by.y = "Gene stable ID", all.x = TRUE) write.table(finalP3vsP5, file="GeneExpression_pH3C004_vs_PH5C004.tsv", sep="\t", row.names=FALSE) # Created table is ready to be uploaded to FIESTA viewer: # https://bioinfogp.cnb.csic.es/tools/FIESTA/index.php