covid_comp_primate-Young-focus_backup.Rnw

\documentclass[11pt, oneside]{article}   	% use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry}                		% See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper}                   		% ... or a4paper or a5paper or ... 
%\geometry{landscape}                		% Activate for for rotated page geometry
%\usepackage[parfill]{parskip}    		% Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx}				% Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
								% TeX will automatically convert eps --> pdf in pdflatex		
%\usepackage{amssymb}

\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}


\title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis}
\author{Marie Cariou}
\date{October 2020}							% Activate to display a given date or no date

\begin{document}
\maketitle

\tableofcontents

\newpage

\section{Files manipulations}

\subsection{Read Janet Young's table}

<<>>=
workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/"

tab<-read.delim(paste0(workdir, 
  "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"),
	fill=T, h=T, dec=",")
dim(tab)

#names(tab)

@

\subsection{Read DGINN Young table}

DGINN-Young-primate table correspond to DGINN results, on the SAME alignment as Young-primate.

I will merge the 2 tables.

<<>>=
dginnY<-read.delim(paste0(workdir, 
  "data/summary_primate_young.res"), 
	fill=T, h=T)

dim(dginnY)

names(dginnY)
@


\subsection{Joining Young and DGINN Young table}

\textit{I hide some code corresponding to verifications of gene names coherence between tables}
<<results="hide", echo=FALSE>>=
head(tab)[,1:5]
# gene avec un nom bizar dans certaines colomne
tab[158,1:10]

#
length(unique(dginnY$Gene))
length(unique(tab$PreyGene))
length(unique(tab$Gene.name))

#quelle paire de colonne contient le plus de noms identiques
sum(unique(dginnY$Gene) %in% unique(tab$PreyGene))
sum(unique(dginnY$Gene) %in% unique(tab$Gene.name))

# dginn$Gene et tab$Gene.name presque identiques sauf 1 ligne. 
# Je soupçonne que c'est celle là:
tab[158,1:10]

# Verif:
tab[,1:10][(tab$Gene.name %in% unique(dginnY$Gene))==F,]
# yep

# Remplacement manuel par
as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in% tab$Gene.name)==F])
# dans le tableau de Janet

val_remp=as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in% tab$Gene.name)==F])

tab$Gene.name<-as.character(tab$Gene.name)

tab$Gene.name[158]<-val_remp

sum(unique(dginnY$Gene) %in% unique(tab$Gene.name))
@

<<>>=

add_col<-function(method="PamlM1M2"){

tmp<-dginnY[dginnY$Method==method,
	   c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")]

names(tmp)<-c("Gene.name", paste0("Omega_", method), 
	      paste0("PosSel_", method), paste0("PValue_", method), 
	      paste0("NbSites_", method), paste0("PSS_", method))

tab<-merge(tab, tmp, by="Gene.name")

return(tab)
}

tab<-add_col("PamlM1M2")
tab<-add_col("PamlM7M8")
tab<-add_col("BppM1M2")
tab<-add_col("BppM7M8")


# Manip pour la colonne BUSTED

tmp<-dginnY[dginnY$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED")
tab<-merge(tab, tmp, by="Gene.name")

tmp<-dginnY[dginnY$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME")
tab<-merge(tab, tmp, by="Gene.name")

@


\subsection{Read DGINN Table}

<<>>=
dginnT<-read.delim(paste0(workdir,
"/data/DGINN_202005281649summary_cleaned.csv"), 
      fill=T, h=T, sep=",")

dim(dginnT)

names(dginnT)

# Number of genes in dginn-primate output not present in the original table
dginnT[(dginnT$Gene %in% tab$Gene.name)==F,"Gene"]
# This includes paralogs, recombinations found by DGINN 
# and additionnal genes included on purpose

# Number of genes from the original list not present in DGINN output
tab[(tab$Gene.name %in% dginnT$Gene)==F,"Gene.name"]


names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize", "dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp",
  "dginn-primate_omegaM0codeml", "dginn-primate_BUSTED", "dginn-primate_BUSTED.p.value",    
  "dginn-primate_MEME.NbSites",       "dginn-primate_MEME.PSS",           "dginn-primate_BppM1M2",           
  "dginn-primate_BppM1M2.p.value",    "dginn-primate_BppM1M2.NbSites",   "dginn-primate_BppM1M2.PSS",       
  "dginn-primate_BppM7M8",            "dginn-primate_BppM7M8.p.value",    "dginn-primate_BppM7M8.NbSites",   
  "dginn-primate_BppM7M8.PSS",        "dginn-primate_codemlM1M2",         "dginn-primate_codemlM1M2.p.value",
  "dginn-primate_codemlM1M2.NbSites", "dginn-primate_codemlM1M2.PSS",     "dginn-primate_codemlM7M8",        
  "dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites", "dginn-primate_codemlM7M8.PSS")

@


\subsection{Join Table and DGINN table}


<<>>=
tab<-merge(tab,dginnT, by="Gene.name", all.x=T)
@


\subsection{Write new table}
<<>>=
write.table(tab, 
	"COVID_PAMLresults_332hits_plusBatScreens_plusDGINN_20201014.txt", 
	row.names=F, quote=F, sep="\t")
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Comparisons Primates}

\subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS Janet Young's results}


Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn.
<<omegaM7M8>>=

plot(tab$whole.gene.dN.dS.model.0, tab$Omega_PamlM7M8, 
     xlab="Omega Young-primate", ylab="Omega DGINN-Young-primate")
abline(0,1)
outlier<-tab[tab$whole.gene.dN.dS.model.0<0.2 & tab$Omega_PamlM7M8>0.4,]
text(x=outlier$whole.gene.dN.dS.model.0,
y=(outlier$Omega_PamlM7M8+0.01),
outlier$Gene.name)     

outlier<-tab[tab$whole.gene.dN.dS.model.0<0.6 & tab$Omega_PamlM7M8>0.7,]
text(x=outlier$whole.gene.dN.dS.model.0,
y=(outlier$Omega_PamlM7M8+0.01),
outlier$Gene.name)    

@


\subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS DGINN-full's results}


Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn.
<<omegaM7M8_2>>=

tab$'dginn-primate_omegaM0Bpp'<-as.numeric(as.character(tab$'dginn-primate_omegaM0Bpp'))
plot(tab$'dginn-primate_omegaM0Bpp', tab$Omega_PamlM7M8, 
     xlab="DGINN-full's", ylab="Omega DGINN-Young-primate")
abline(0,1)    

outlier<-tab[tab$'dginn-primate_omegaM0Bpp'>0.4 & tab$Omega_PamlM7M8<0.2,]
text(x=outlier$'dginn-primate_omegaM0Bpp',
y=(outlier$Omega_PamlM7M8+0.01),
outlier$Gene.name)     

outlier<-tab[tab$'dginn-primate_omegaM0Bpp'>0.5 & tab$Omega_PamlM7M8<0.4,]
text(x=outlier$'dginn-primate_omegaM0Bpp',
y=(outlier$Omega_PamlM7M8+0.01),
outlier$Gene.name)  

@


\subsection{Janet Young's results (Young-primate) VS DGINN-full's results}


Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn.
<<omegaM7M8_3>>=

plot(tab$whole.gene.dN.dS.model.0, as.numeric(as.character(tab$'dginn-primate_omegaM0Bpp')), 
     xlab="Omega Young-primate", ylab="DGINN-full's")
abline(0,1)


outlier<-tab[tab$whole.gene.dN.dS.model.0<0.4 & tab$'dginn-primate_omegaM0Bpp'>0.5,]
text(x=outlier$whole.gene.dN.dS.model.0,
y=outlier$'dginn-primate_omegaM0Bpp',
outlier$Gene.name)

@


\section{Overlap}


\subsection{Mondrian}

<<mondrianprimates>>=

library(Mondrian)

#######

monddata<-as.data.frame(tab$Gene.name)
dim(monddata)

dginnyoungtmp<-rowSums(cbind(tab$PosSel_PamlM1M2=="Y", tab$PosSel_PamlM7M8=="Y", 
tab$PosSel_BppM1M2=="Y", tab$PosSel_BppM7M8=="Y", tab$PosSel_BUSTED=="Y")) 

#monddata$primates_dginn_young<-ifelse(tmp$PosSel_PamlM7M8=="Y", 1,0)


dginnfulltmp<-rowSums(cbind(tab$'dginn-primate_BUSTED'=="Y", tab$'dginn-primate_BppM1M2'=="Y", 
tab$'dginn-primate_BppM7M8'=="Y", tab$'dginn-primate_codemlM1M2'=="Y", tab$'dginn-primate_codemlM7M8'=="Y")) 


monddata$primates_young<-ifelse(tab$pVal.M8vsM7<0.05, 1, 0)
#monddata$primates_cooper<-ifelse(tab$cooper.primates.M7.M8_p_val<0.05, 1, 0)


monddata$primates_dginn_young<-ifelse(dginnyoungtmp>=3, 1,0)
monddata$primates_dginn_full<-ifelse(dginnfulltmp>=3, 1,0)

mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young >=3", "DGINN-full >=3" ))

#####
monddata$primates_dginn_young<-ifelse(dginnyoungtmp>=4, 1,0)
monddata$primates_dginn_full<-ifelse(dginnfulltmp>=4, 1,0)

mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young >=4", "DGINN-full >=4"))
@

Comparison of results with the same method.
<<>>=

#####
monddata$primates_dginn_young<-tab$PosSel_BppM7M8=="Y"
monddata$primates_dginn_full<-tab$'dginn-primate_codemlM7M8'=="Y"

mondrian(na.omit(monddata[,2:4]), labels=c("Young", "DGINN-Young", "DGINN-full"), main="posel codeml M7M8")
@

\subsection{subsetR}

Just another representation of the same result.

<<subsetprimates>>=
library(UpSetR)
upsetdata<-as.data.frame(tab$Gene.name)


upsetdata$primates_young<-ifelse(tab$pVal.M8vsM7<0.05, 1, 0)

###
upsetdata$primates_dginn_young<-ifelse(dginnyoungtmp>=3, 1,0)
upsetdata$primates_dginn_full<-ifelse(dginnfulltmp>=3, 1,0)


upset(na.omit(upsetdata), nsets = 3, matrix.color = "#DC267F", 
main.bar.color = "#648FFF", sets.bar.color = "#FE6100")

###
upsetdata$primates_dginn_young<-ifelse(dginnyoungtmp>=4, 1,0)
upsetdata$primates_dginn_full<-ifelse(dginnfulltmp>=4, 1,0)

upset(na.omit(upsetdata), nsets = 3, matrix.color = "#DC267F", 
main.bar.color = "#648FFF", sets.bar.color = "#FE6100")

@

\section{Gene List}

Genes under positive selection for at least 4 methods.

<<>>=
dginnfulltmp<-rowSums(cbind(tab$'dginn-primate_BUSTED'=="Y",
 tab$'dginn-primate_BppM1M2'=="Y", 
tab$'dginn-primate_BppM7M8'=="Y", 
tab$'dginn-primate_codemlM1M2'=="Y", 
tab$'dginn-primate_codemlM7M8'=="Y")) 

tab$Gene.name[dginnfulltmp>=4 & is.na(dginnfulltmp)==F]

tab$Gene.name[dginnfulltmp>=3 & is.na(dginnfulltmp)==F]

tmp<-tab[dginnfulltmp>=4 & is.na(dginnfulltmp)==F, 
c("Gene.name","dginn-primate_BUSTED", "dginn-primate_BppM1M2",
 "dginn-primate_BppM7M8","dginn-primate_codemlM1M2","dginn-primate_codemlM7M8")]

write.table(tmp, "geneList_DGINN_full_primate_pos4.txt", row.names=F, quote=F)
@


\section{Shiny like}

<<shiny, fig.height=11>>=
makeFig1 <- function(df){

  # prepare data for colors etc
  colMethods <- c("deepskyblue4", "darkorange" , "deepskyblue3" , "mediumseagreen" , "yellow3" , "black")
  nameMethods <- c("BUSTED", "BppM1M2", "BppM7M8", "codemlM1M2", "codemlM7M8", "MEME")
  metColor <- data.frame(Name = nameMethods , Col = colMethods , stringsAsFactors = FALSE)
  
  # subset for this specific figure
  #df <- df[df$nbY >= 1, ] # to drop genes found by 0 methods (big datasets)
  xt <- df[, c("BUSTED", "BppM1M2", "BppM7M8", "codemlM1M2", "codemlM7M8")]
  xt$Gene <- df$Gene
  nbrMeth <- 5
  # reverse order of dataframe so that genes with the most Y are at the bottom (to be on top of the barplot)
  xt[,1:5] <- ifelse(xt[,1:5] == "Y", 1, 0)
  # sort and Filter the 0 lines
  xt<-xt[order(rowSums(xt[,1:5])),]
  xt<-xt[rowSums(xt[,1:5])>2,]  

  row.names(xt)<-xt$Gene
  xt<-xt[,1:5]

  colFig1 <- metColor[which(metColor$Name %in% colnames(xt)) , ]
  
  ##### PART 1 : NUMBER OF METHODS
  par(xpd = NA , mar=c(2,7,4,0) ,   oma = c(0,0,0,0) , mgp = c(3,0.3,0))
  
  h =  barplot(
    t(xt),
    border = NA ,
    axes = F ,
    col = adjustcolor(colFig1$Col, alpha.f = 1),
    horiz = T ,
    las  = 2 ,
    main = "Methods detecting positive selection" , 
    cex.main = 0.85,
    cex.names  = min(50/nrow(xt), 1.5)
  )
  
  axis(3, line = 0, at = c(0:nbrMeth), label = c("0", rep("", nbrMeth -1), nbrMeth), tck = 0.02)
  
  legend("bottomleft",
         horiz = T,
         border = colFig1$Col,
         legend = colFig1$Name, 
         fill = colFig1$Col,
         cex = 0.8,
         bty = "n",
         xpd = NA
  )
}


df<-read.delim(paste0(workdir,
"/data/DGINN_202005281649summary_cleaned.csv"), 
      fill=T, h=T, sep=",")


makeFig1(df)

@

\end{document}