diff --git a/script/4_parse_PSIblast.R b/script/4_parse_PSIblast.R
index 654a7291bea608be1aeb69c68cb44b6e129c6146..a9f47ab2b1424144cf31aa886aa43df3b7d6fc35 100644
--- a/script/4_parse_PSIblast.R
+++ b/script/4_parse_PSIblast.R
@@ -8,7 +8,9 @@ if (length(args)!=6) {
 
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
+#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
 
+print("reading arguments and data...")
 
 file<-args[1]
 eval=as.numeric(args[2])
@@ -16,107 +18,106 @@ percid=as.numeric(args[3])
 overlap=as.numeric(args[4])
 nind=as.numeric(args[5])
 fasta=args[6]
-
+blastdb=args[7]
+Taxonomizer=args[8]
 
 blastn<-read.table(file, h=F, fill=T)
 
-
 names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length",
                  "pident","evalue","nident","mismatch", "gapopen",
                  "qstart", "qend", "qseq", "score", "sstart", 
                  "send", "sseq", "sstrand")
 
-### report
-
 print(paste("Number of total hit: ", nrow(blastn)))
-
 print(paste("Number of query:", length(unique(blastn$qseqid))))
-
-print("Species range (row):")
-      
-print(table(blastn$species))
-
-### Filters
+print("Filtering based on hit caracteristics")
 
 blastn<-blastn[blastn$evalue<=eval,]
-
 print(paste("Number of hits after evalue filter: ", nrow(blastn)))
 
 blastn<-blastn[blastn$pident>=percid*100,]
-
 print(paste("Number of hits after perc id filter: ", nrow(blastn)))
 
-blastn<-blastn[blastn$nident/blastn$qlen>=overlap,]
-
-print(paste("Number of hits after perc id filter: ", nrow(blastn)))
-
-#blastn<-blastn[blastn$species!="",]
-
-
+blastn<-blastn[blastn$slen/as.numeric(blastn$qlen)>=overlap,]
 print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
 
-##### Get species id
-blastn$realid<-sapply(as.character(blastn$sseqid), function(x) strsplit(x, ".", fixed=TRUE)[[1]][1])
-
-## Use an reutils?
-library(reutils)
+##### Reformate subject id
+blastn$realid<-sapply(as.character(blastn$sseqid), function(x) paste0(
+    strsplit(x, ".", fixed=TRUE)[[1]][1], ".",
+    strsplit(x, ".", fixed=TRUE)[[1]][2]))
 
+#### Keep one sequence per subject id
+selecthit<-function(subjectid){
+    tmp<-blastn[blastn$sseqid==subjectid,]
+    return(tmp[order(tmp$evalue),][order(tmp$length),][1,])
+}
+blastn<-as.data.frame(t(sapply(unique(blastn$sseqid), selecthit)))
 
-uid<-blastn$realid[1]
-tmp<-efetch(uid, db = "nuccore", rettype="gb")
+print(paste("Keep best hit for each subject, nrow = ", nrow(blastn)))
 
-### Use Taxonomizr
+### Use Taxonomizr to get species names
 #install.packages("taxonomizr")
 library(taxonomizr)
 
-prepareDatabase("accessionTaxa.sql")
-taxaId<-accessionToTaxa(uid,"accessionTaxa.sql")
+#prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql")
+blastn$realid<-unlist(blastn$realid)
+#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
+taxaId<-accessionToTaxa(blastn$realid, Taxonomizer)
 
+#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
+Taxa<-getTaxonomy(taxaId,Taxonomizer)
 
-Taxa<-getTaxonomy(taxaId,'accessionTaxa.sql')
+print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa)))
 
-head(Taxa)
+blastn<-cbind(blastn, Taxa)
 
-dim(blastn)
-dim(Taxa)
+#### Keep one sequence per subject id
+selecthit<-function(species){
+    tmp<-blastn[blastn$specie==species,]
+    tmp[order(tmp$evalue),][order(tmp$length),]
+    return(tmp$sseqid[1])
+}
 
-blastn<-cbind(blastn, Taxa)
+blastn$evalue<-unlist(blastn$evalue)
+blastn$length<-unlist(blastn$length)
+listid<-na.omit(unlist(sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))))
 
 
+blastn$sseqid<-unlist(blastn$sseqid)
 
-#getsp<-function(id){
-#    cmd=paste0("efetch -db nuccore -id ", id, " -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'")
-#    species<-system(cmd)
-#    return(species)
-#    }
+blastn<-blastn[blastn$sseqid %in% listid,]
+print(paste("Keep best hit for each species, nrow = ", nrow(blastn)))
 
-#species<-sapply(blastn$realid, function(x) getsp(x))
+listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
+#### Retrieve sequences via blastcmd
+system(paste0("rm ", fasta))
+for (seq in listid_short){ 
+cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ",  seq, " >> ", fasta)
+system(cmd)
+}
 
-#### Garder 1 séquence par espèce
 
+#### Change names in the fasta
+library(ape)
 
+aln<-read.dna(fasta, format="fasta")
 
+names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_")
 
+names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1])
 
 
+newnames<-sapply(names(aln), function(x){
+ names(listid_short)[listid_short==x]
+})
 
+names(aln)<-newnames
 
-#### Now select 1 sequence per species
-#species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1"
+fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE)
 
-#selecthit<-function(species){
-#    tmp<-blastn[blastn$specie==species,]
-#    tmp[order(tmp$evalue),][order(tmp$length),]
-#    return(tmp$sseqid[1])
-#}
+write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
 
-#listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))
-#listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2))
 
-#for (seq in listid2){ 
-#cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ",  seq, " >> ", fasta)
-#system(cmd)
-#}
 
 
 
diff --git a/script/4_parse_PSIblast.sh b/script/4_parse_PSIblast.sh
index 1e3b0998c6c845140d8d74d090bea8d9fe2351e0..46d22624287aae488d64b2541f84dbabdc6aa1e0 100755
--- a/script/4_parse_PSIblast.sh
+++ b/script/4_parse_PSIblast.sh
@@ -21,6 +21,9 @@ echo "--------------------------------------------------"
 module load R
 
 
+BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc" 
+TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql
+
 # variable
 OUT_BLAST=$1
 TAB_SP=$2
diff --git a/script/4_parse_PSIblastTMP.R b/script/4_parse_PSIblastTMP.R
index 68c90c714e29cf95a578a7d11ad86b1de90a60a1..a9f47ab2b1424144cf31aa886aa43df3b7d6fc35 100644
--- a/script/4_parse_PSIblastTMP.R
+++ b/script/4_parse_PSIblastTMP.R
@@ -8,7 +8,7 @@ if (length(args)!=6) {
 
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
-#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
+#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
 
 print("reading arguments and data...")
 
@@ -18,7 +18,8 @@ percid=as.numeric(args[3])
 overlap=as.numeric(args[4])
 nind=as.numeric(args[5])
 fasta=args[6]
-
+blastdb=args[7]
+Taxonomizer=args[8]
 
 blastn<-read.table(file, h=F, fill=T)
 
@@ -60,9 +61,12 @@ library(taxonomizr)
 
 #prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql")
 blastn$realid<-unlist(blastn$realid)
-taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
+#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
+taxaId<-accessionToTaxa(blastn$realid, Taxonomizer)
+
+#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
+Taxa<-getTaxonomy(taxaId,Taxonomizer)
 
-Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
 print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa)))
 
 blastn<-cbind(blastn, Taxa)
@@ -88,7 +92,7 @@ listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
 #### Retrieve sequences via blastcmd
 system(paste0("rm ", fasta))
 for (seq in listid_short){ 
-cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ",  seq, " >> ", fasta)
+cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ",  seq, " >> ", fasta)
 system(cmd)
 }
 
diff --git a/script/4_parse_PSIblastTMP.sh b/script/4_parse_PSIblastTMP.sh
index b1367785a41795326a10951d4f5ab54840369656..aa99b1ebc296521d7e4081281c94435b45d7c5a3 100755
--- a/script/4_parse_PSIblastTMP.sh
+++ b/script/4_parse_PSIblastTMP.sh
@@ -20,6 +20,9 @@ echo "--------------------------------------------------"
 ##################################################################################################################
 module load R
 
+BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc" 
+TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql"
+
 
 # variable
 OUT_BLAST=$1
@@ -58,7 +61,7 @@ echo "sequences already retrieved"
 echo $FASTA
 else
 echo "retrieve fasta"
-Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log
+Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA $BLASTDBNUC $TAXO > $FASTA_REP/paste_blast_test.log
 fi ;