\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format %\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots. %\geometry{letterpaper} % ... or a4paper or a5paper or ... %\geometry{landscape} % Activate for for rotated page geometry %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent %\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode % TeX will automatically convert eps --> pdf in pdflatex %\usepackage{amssymb} \usepackage[utf8]{inputenc} %\usepackage[cyr]{aeguill} %\usepackage[francais]{babel} %\usepackage{hyperref} \title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis} \author{Marie Cariou} \date{October 2020} % Activate to display a given date or no date \begin{document} \maketitle \tableofcontents \newpage \section{Files manipulations} \subsection{Read Janet Young's table} <<>>= workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/" tab<-read.delim(paste0(workdir, "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"), fill=T, h=T, dec=",") dim(tab) #names(tab) @ \subsection{Read DGINN Young table} DGINN-Young-primate table correspond to DGINN results, on the SAME alignment as Young-primate. I will merge the 2 tables. <<>>= dginnY<-read.delim(paste0(workdir, "data/summary_primate_young.res"), fill=T, h=T) dim(dginnY) names(dginnY) @ \subsection{Joining Young and DGINN Young table} \textit{I hide some code corresponding to verifications of gene names coherence between tables} <<results="hide", echo=FALSE>>= head(tab)[,1:5] # gene avec un nom bizar dans certaines colomne tab[158,1:10] # length(unique(dginnY$Gene)) length(unique(tab$PreyGene)) length(unique(tab$Gene.name)) #quelle paire de colonne contient le plus de noms identiques sum(unique(dginnY$Gene) %in% unique(tab$PreyGene)) sum(unique(dginnY$Gene) %in% unique(tab$Gene.name)) # dginn$Gene et tab$Gene.name presque identiques sauf 1 ligne. # Je soupçonne que c'est celle là: tab[158,1:10] # Verif: tab[,1:10][(tab$Gene.name %in% unique(dginnY$Gene))==F,] # yep # Remplacement manuel par as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in% tab$Gene.name)==F]) # dans le tableau de Janet val_remp=as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in% tab$Gene.name)==F]) tab$Gene.name<-as.character(tab$Gene.name) tab$Gene.name[158]<-val_remp sum(unique(dginnY$Gene) %in% unique(tab$Gene.name)) @ <<>>= add_col<-function(method="PamlM1M2"){ tmp<-dginnY[dginnY$Method==method, c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")] names(tmp)<-c("Gene.name", paste0("Omega_", method), paste0("PosSel_", method), paste0("PValue_", method), paste0("NbSites_", method), paste0("PSS_", method)) tab<-merge(tab, tmp, by="Gene.name") return(tab) } tab<-add_col("PamlM1M2") tab<-add_col("PamlM7M8") tab<-add_col("BppM1M2") tab<-add_col("BppM7M8") # Manip pour la colonne BUSTED tmp<-dginnY[dginnY$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")] names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED") tab<-merge(tab, tmp, by="Gene.name") tmp<-dginnY[dginnY$Method=="MEME",c("Gene", "NbSites", "PSS")] names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME") tab<-merge(tab, tmp, by="Gene.name") @ \subsection{Read DGINN Table} <<>>= dginnT<-read.delim(paste0(workdir, "/data/DGINN_202005281649summary_cleaned.csv"), fill=T, h=T, sep=",") dim(dginnT) names(dginnT) # Number of genes in dginn-primate output not present in the original table dginnT[(dginnT$Gene %in% tab$Gene.name)==F,"Gene"] # This includes paralogs, recombinations found by DGINN # and additionnal genes included on purpose # Number of genes from the original list not present in DGINN output tab[(tab$Gene.name %in% dginnT$Gene)==F,"Gene.name"] names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize", "dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp", "dginn-primate_omegaM0codeml", "dginn-primate_BUSTED", "dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites", "dginn-primate_MEME.PSS", "dginn-primate_BppM1M2", "dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites", "dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8", "dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites", "dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2", "dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites", "dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8", "dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites", "dginn-primate_codemlM7M8.PSS") @ \subsection{Join Table and DGINN table} <<>>= tab<-merge(tab,dginnT, by="Gene.name", all.x=T) @ \subsection{Write new table} <<>>= write.table(tab, "COVID_PAMLresults_332hits_plusBatScreens_plusDGINN_20201014.txt", row.names=F, quote=F, sep="\t") @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Comparisons Primates} \subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS Janet Young's results} Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn. <<omegaM7M8>>= plot(tab$whole.gene.dN.dS.model.0, tab$Omega_PamlM7M8, xlab="Omega Young-primate", ylab="Omega DGINN-Young-primate") abline(0,1) outlier<-tab[tab$whole.gene.dN.dS.model.0<0.2 & tab$Omega_PamlM7M8>0.4,] text(x=outlier$whole.gene.dN.dS.model.0, y=(outlier$Omega_PamlM7M8+0.01), outlier$Gene.name) outlier<-tab[tab$whole.gene.dN.dS.model.0<0.6 & tab$Omega_PamlM7M8>0.7,] text(x=outlier$whole.gene.dN.dS.model.0, y=(outlier$Omega_PamlM7M8+0.01), outlier$Gene.name) @ \subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS DGINN-full's results} Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn. <<omegaM7M8_2>>= tab$'dginn-primate_omegaM0Bpp'<-as.numeric(as.character(tab$'dginn-primate_omegaM0Bpp')) plot(tab$'dginn-primate_omegaM0Bpp', tab$Omega_PamlM7M8, xlab="DGINN-full's", ylab="Omega DGINN-Young-primate") abline(0,1) outlier<-tab[tab$'dginn-primate_omegaM0Bpp'>0.4 & tab$Omega_PamlM7M8<0.2,] text(x=outlier$'dginn-primate_omegaM0Bpp', y=(outlier$Omega_PamlM7M8+0.01), outlier$Gene.name) outlier<-tab[tab$'dginn-primate_omegaM0Bpp'>0.5 & tab$Omega_PamlM7M8<0.4,] text(x=outlier$'dginn-primate_omegaM0Bpp', y=(outlier$Omega_PamlM7M8+0.01), outlier$Gene.name) @ \subsection{Janet Young's results (Young-primate) VS DGINN-full's results} Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn. <<omegaM7M8_3>>= plot(tab$whole.gene.dN.dS.model.0, as.numeric(as.character(tab$'dginn-primate_omegaM0Bpp')), xlab="Omega Young-primate", ylab="DGINN-full's") abline(0,1) outlier<-tab[tab$whole.gene.dN.dS.model.0<0.4 & tab$'dginn-primate_omegaM0Bpp'>0.5,] text(x=outlier$whole.gene.dN.dS.model.0, y=outlier$'dginn-primate_omegaM0Bpp', outlier$Gene.name) @ \section{Overlap} \subsection{Mondrian} <<mondrianprimates>>= library(Mondrian) ####### monddata<-as.data.frame(tab$Gene.name) dim(monddata) dginnyoungtmp<-rowSums(cbind(tab$PosSel_PamlM1M2=="Y", tab$PosSel_PamlM7M8=="Y", tab$PosSel_BppM1M2=="Y", tab$PosSel_BppM7M8=="Y", tab$PosSel_BUSTED=="Y")) #monddata$primates_dginn_young<-ifelse(tmp$PosSel_PamlM7M8=="Y", 1,0) dginnfulltmp<-rowSums(cbind(tab$'dginn-primate_BUSTED'=="Y", tab$'dginn-primate_BppM1M2'=="Y", tab$'dginn-primate_BppM7M8'=="Y", tab$'dginn-primate_codemlM1M2'=="Y", tab$'dginn-primate_codemlM7M8'=="Y")) monddata$primates_young<-ifelse(tab$pVal.M8vsM7<0.05, 1, 0) #monddata$primates_cooper<-ifelse(tab$cooper.primates.M7.M8_p_val<0.05, 1, 0) monddata$primates_dginn_young<-ifelse(dginnyoungtmp>=3, 1,0) monddata$primates_dginn_full<-ifelse(dginnfulltmp>=3, 1,0) mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young >=3", "DGINN-full >=3" )) ##### monddata$primates_dginn_young<-ifelse(dginnyoungtmp>=4, 1,0) monddata$primates_dginn_full<-ifelse(dginnfulltmp>=4, 1,0) mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young >=4", "DGINN-full >=4")) @ Comparison of results with the same method. <<>>= ##### monddata$primates_dginn_young<-tab$PosSel_BppM7M8=="Y" monddata$primates_dginn_full<-tab$'dginn-primate_codemlM7M8'=="Y" mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young", "DGINN-full"), main="posel codeml M7M8") @ \subsection{subsetR} Just another representation of the same result. <<subsetprimates>>= library(UpSetR) upsetdata<-as.data.frame(tab$Gene.name) upsetdata$primates_young<-ifelse(tab$pVal.M8vsM7<0.05, 1, 0) ### upsetdata$primates_dginn_young<-ifelse(dginnyoungtmp>=3, 1,0) upsetdata$primates_dginn_full<-ifelse(dginnfulltmp>=3, 1,0) upset(na.omit(upsetdata), nsets = 3, matrix.color = "#DC267F", main.bar.color = "#648FFF", sets.bar.color = "#FE6100") ### upsetdata$primates_dginn_young<-ifelse(dginnyoungtmp>=4, 1,0) upsetdata$primates_dginn_full<-ifelse(dginnfulltmp>=4, 1,0) upset(na.omit(upsetdata), nsets = 3, matrix.color = "#DC267F", main.bar.color = "#648FFF", sets.bar.color = "#FE6100") @ \section{Gene List} Genes under positive selection for at least 4 methods. <<>>= dginnfulltmp<-rowSums(cbind(tab$'dginn-primate_BUSTED'=="Y", tab$'dginn-primate_BppM1M2'=="Y", tab$'dginn-primate_BppM7M8'=="Y", tab$'dginn-primate_codemlM1M2'=="Y", tab$'dginn-primate_codemlM7M8'=="Y")) tab$Gene.name[dginnfulltmp>=4 & is.na(dginnfulltmp)==F] tab$Gene.name[dginnfulltmp>=3 & is.na(dginnfulltmp)==F] tmp<-tab[dginnfulltmp>=4 & is.na(dginnfulltmp)==F, c("Gene.name","dginn-primate_BUSTED", "dginn-primate_BppM1M2", "dginn-primate_BppM7M8","dginn-primate_codemlM1M2","dginn-primate_codemlM7M8")] write.table(tmp, "geneList_DGINN_full_primate_pos4.txt", row.names=F, quote=F) @ \section{Shiny like} <<shiny, fig.height=11>>= makeFig1 <- function(df){ # prepare data for colors etc colMethods <- c("deepskyblue4", "darkorange" , "deepskyblue3" , "mediumseagreen" , "yellow3" , "black") nameMethods <- c("BUSTED", "BppM1M2", "BppM7M8", "codemlM1M2", "codemlM7M8", "MEME") metColor <- data.frame(Name = nameMethods , Col = colMethods , stringsAsFactors = FALSE) # subset for this specific figure #df <- df[df$nbY >= 1, ] # to drop genes found by 0 methods (big datasets) xt <- df[, c("BUSTED", "BppM1M2", "BppM7M8", "codemlM1M2", "codemlM7M8")] xt$Gene <- df$Gene nbrMeth <- 5 # reverse order of dataframe so that genes with the most Y are at the bottom (to be on top of the barplot) xt[,1:5] <- ifelse(xt[,1:5] == "Y", 1, 0) # sort and Filter the 0 lines xt<-xt[order(rowSums(xt[,1:5])),] xt<-xt[rowSums(xt[,1:5])>2,] row.names(xt)<-xt$Gene xt<-xt[,1:5] colFig1 <- metColor[which(metColor$Name %in% colnames(xt)) , ] ##### PART 1 : NUMBER OF METHODS par(xpd = NA , mar=c(2,7,4,0) , oma = c(0,0,0,0) , mgp = c(3,0.3,0)) h = barplot( t(xt), border = NA , axes = F , col = adjustcolor(colFig1$Col, alpha.f = 1), horiz = T , las = 2 , main = "Methods detecting positive selection" , cex.main = 0.85, cex.names = min(50/nrow(xt), 1.5) ) axis(3, line = 0, at = c(0:nbrMeth), label = c("0", rep("", nbrMeth -1), nbrMeth), tck = 0.02) legend("bottomleft", horiz = T, border = colFig1$Col, legend = colFig1$Name, fill = colFig1$Col, cex = 0.8, bty = "n", xpd = NA ) } df<-read.delim(paste0(workdir, "/data/DGINN_202005281649summary_cleaned.csv"), fill=T, h=T, sep=",") makeFig1(df) @ \end{document}