diff --git a/script/4_parse_PSIblast.R b/script/4_parse_PSIblast.R index 654a7291bea608be1aeb69c68cb44b6e129c6146..a9f47ab2b1424144cf31aa886aa43df3b7d6fc35 100644 --- a/script/4_parse_PSIblast.R +++ b/script/4_parse_PSIblast.R @@ -8,7 +8,9 @@ if (length(args)!=6) { #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") +#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +print("reading arguments and data...") file<-args[1] eval=as.numeric(args[2]) @@ -16,107 +18,106 @@ percid=as.numeric(args[3]) overlap=as.numeric(args[4]) nind=as.numeric(args[5]) fasta=args[6] - +blastdb=args[7] +Taxonomizer=args[8] blastn<-read.table(file, h=F, fill=T) - names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length", "pident","evalue","nident","mismatch", "gapopen", "qstart", "qend", "qseq", "score", "sstart", "send", "sseq", "sstrand") -### report - print(paste("Number of total hit: ", nrow(blastn))) - print(paste("Number of query:", length(unique(blastn$qseqid)))) - -print("Species range (row):") - -print(table(blastn$species)) - -### Filters +print("Filtering based on hit caracteristics") blastn<-blastn[blastn$evalue<=eval,] - print(paste("Number of hits after evalue filter: ", nrow(blastn))) blastn<-blastn[blastn$pident>=percid*100,] - print(paste("Number of hits after perc id filter: ", nrow(blastn))) -blastn<-blastn[blastn$nident/blastn$qlen>=overlap,] - -print(paste("Number of hits after perc id filter: ", nrow(blastn))) - -#blastn<-blastn[blastn$species!="",] - - +blastn<-blastn[blastn$slen/as.numeric(blastn$qlen)>=overlap,] print(paste("Number of hits after unidentified removal: ", nrow(blastn))) -##### Get species id -blastn$realid<-sapply(as.character(blastn$sseqid), function(x) strsplit(x, ".", fixed=TRUE)[[1]][1]) - -## Use an reutils? -library(reutils) +##### Reformate subject id +blastn$realid<-sapply(as.character(blastn$sseqid), function(x) paste0( + strsplit(x, ".", fixed=TRUE)[[1]][1], ".", + strsplit(x, ".", fixed=TRUE)[[1]][2])) +#### Keep one sequence per subject id +selecthit<-function(subjectid){ + tmp<-blastn[blastn$sseqid==subjectid,] + return(tmp[order(tmp$evalue),][order(tmp$length),][1,]) +} +blastn<-as.data.frame(t(sapply(unique(blastn$sseqid), selecthit))) -uid<-blastn$realid[1] -tmp<-efetch(uid, db = "nuccore", rettype="gb") +print(paste("Keep best hit for each subject, nrow = ", nrow(blastn))) -### Use Taxonomizr +### Use Taxonomizr to get species names #install.packages("taxonomizr") library(taxonomizr) -prepareDatabase("accessionTaxa.sql") -taxaId<-accessionToTaxa(uid,"accessionTaxa.sql") +#prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql") +blastn$realid<-unlist(blastn$realid) +#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +taxaId<-accessionToTaxa(blastn$realid, Taxonomizer) +#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +Taxa<-getTaxonomy(taxaId,Taxonomizer) -Taxa<-getTaxonomy(taxaId,'accessionTaxa.sql') +print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa))) -head(Taxa) +blastn<-cbind(blastn, Taxa) -dim(blastn) -dim(Taxa) +#### Keep one sequence per subject id +selecthit<-function(species){ + tmp<-blastn[blastn$specie==species,] + tmp[order(tmp$evalue),][order(tmp$length),] + return(tmp$sseqid[1]) +} -blastn<-cbind(blastn, Taxa) +blastn$evalue<-unlist(blastn$evalue) +blastn$length<-unlist(blastn$length) +listid<-na.omit(unlist(sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x)))) +blastn$sseqid<-unlist(blastn$sseqid) -#getsp<-function(id){ -# cmd=paste0("efetch -db nuccore -id ", id, " -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'") -# species<-system(cmd) -# return(species) -# } +blastn<-blastn[blastn$sseqid %in% listid,] +print(paste("Keep best hit for each species, nrow = ", nrow(blastn))) -#species<-sapply(blastn$realid, function(x) getsp(x)) +listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2)) +#### Retrieve sequences via blastcmd +system(paste0("rm ", fasta)) +for (seq in listid_short){ +cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ", seq, " >> ", fasta) +system(cmd) +} -#### Garder 1 séquence par espèce +#### Change names in the fasta +library(ape) +aln<-read.dna(fasta, format="fasta") +names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_") +names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1]) +newnames<-sapply(names(aln), function(x){ + names(listid_short)[listid_short==x] +}) +names(aln)<-newnames -#### Now select 1 sequence per species -#species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1" +fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE) -#selecthit<-function(species){ -# tmp<-blastn[blastn$specie==species,] -# tmp[order(tmp$evalue),][order(tmp$length),] -# return(tmp$sseqid[1]) -#} +write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10) -#listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x)) -#listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2)) -#for (seq in listid2){ -#cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta) -#system(cmd) -#} diff --git a/script/4_parse_PSIblast.sh b/script/4_parse_PSIblast.sh index 1e3b0998c6c845140d8d74d090bea8d9fe2351e0..46d22624287aae488d64b2541f84dbabdc6aa1e0 100755 --- a/script/4_parse_PSIblast.sh +++ b/script/4_parse_PSIblast.sh @@ -21,6 +21,9 @@ echo "--------------------------------------------------" module load R +BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc" +TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql + # variable OUT_BLAST=$1 TAB_SP=$2 diff --git a/script/4_parse_PSIblastTMP.R b/script/4_parse_PSIblastTMP.R index 68c90c714e29cf95a578a7d11ad86b1de90a60a1..a9f47ab2b1424144cf31aa886aa43df3b7d6fc35 100644 --- a/script/4_parse_PSIblastTMP.R +++ b/script/4_parse_PSIblastTMP.R @@ -8,7 +8,7 @@ if (length(args)!=6) { #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") -#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta") +#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") print("reading arguments and data...") @@ -18,7 +18,8 @@ percid=as.numeric(args[3]) overlap=as.numeric(args[4]) nind=as.numeric(args[5]) fasta=args[6] - +blastdb=args[7] +Taxonomizer=args[8] blastn<-read.table(file, h=F, fill=T) @@ -60,9 +61,12 @@ library(taxonomizr) #prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql") blastn$realid<-unlist(blastn$realid) -taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +taxaId<-accessionToTaxa(blastn$realid, Taxonomizer) + +#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") +Taxa<-getTaxonomy(taxaId,Taxonomizer) -Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql") print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa))) blastn<-cbind(blastn, Taxa) @@ -88,7 +92,7 @@ listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2)) #### Retrieve sequences via blastcmd system(paste0("rm ", fasta)) for (seq in listid_short){ -cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta) +cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ", seq, " >> ", fasta) system(cmd) } diff --git a/script/4_parse_PSIblastTMP.sh b/script/4_parse_PSIblastTMP.sh index b1367785a41795326a10951d4f5ab54840369656..aa99b1ebc296521d7e4081281c94435b45d7c5a3 100755 --- a/script/4_parse_PSIblastTMP.sh +++ b/script/4_parse_PSIblastTMP.sh @@ -20,6 +20,9 @@ echo "--------------------------------------------------" ################################################################################################################## module load R +BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc" +TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql" + # variable OUT_BLAST=$1 @@ -58,7 +61,7 @@ echo "sequences already retrieved" echo $FASTA else echo "retrieve fasta" -Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log +Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA $BLASTDBNUC $TAXO > $FASTA_REP/paste_blast_test.log fi ;