Commit 70125ace authored by mcariou's avatar mcariou
Browse files

modif 4th script

parent 094a9f42
......@@ -8,7 +8,9 @@ if (length(args)!=6) {
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
print("reading arguments and data...")
file<-args[1]
eval=as.numeric(args[2])
......@@ -16,107 +18,106 @@ percid=as.numeric(args[3])
overlap=as.numeric(args[4])
nind=as.numeric(args[5])
fasta=args[6]
blastdb=args[7]
Taxonomizer=args[8]
blastn<-read.table(file, h=F, fill=T)
names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length",
"pident","evalue","nident","mismatch", "gapopen",
"qstart", "qend", "qseq", "score", "sstart",
"send", "sseq", "sstrand")
### report
print(paste("Number of total hit: ", nrow(blastn)))
print(paste("Number of query:", length(unique(blastn$qseqid))))
print("Species range (row):")
print(table(blastn$species))
### Filters
print("Filtering based on hit caracteristics")
blastn<-blastn[blastn$evalue<=eval,]
print(paste("Number of hits after evalue filter: ", nrow(blastn)))
blastn<-blastn[blastn$pident>=percid*100,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
blastn<-blastn[blastn$nident/blastn$qlen>=overlap,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
#blastn<-blastn[blastn$species!="",]
blastn<-blastn[blastn$slen/as.numeric(blastn$qlen)>=overlap,]
print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
##### Get species id
blastn$realid<-sapply(as.character(blastn$sseqid), function(x) strsplit(x, ".", fixed=TRUE)[[1]][1])
## Use an reutils?
library(reutils)
##### Reformate subject id
blastn$realid<-sapply(as.character(blastn$sseqid), function(x) paste0(
strsplit(x, ".", fixed=TRUE)[[1]][1], ".",
strsplit(x, ".", fixed=TRUE)[[1]][2]))
#### Keep one sequence per subject id
selecthit<-function(subjectid){
tmp<-blastn[blastn$sseqid==subjectid,]
return(tmp[order(tmp$evalue),][order(tmp$length),][1,])
}
blastn<-as.data.frame(t(sapply(unique(blastn$sseqid), selecthit)))
uid<-blastn$realid[1]
tmp<-efetch(uid, db = "nuccore", rettype="gb")
print(paste("Keep best hit for each subject, nrow = ", nrow(blastn)))
### Use Taxonomizr
### Use Taxonomizr to get species names
#install.packages("taxonomizr")
library(taxonomizr)
prepareDatabase("accessionTaxa.sql")
taxaId<-accessionToTaxa(uid,"accessionTaxa.sql")
#prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql")
blastn$realid<-unlist(blastn$realid)
#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
taxaId<-accessionToTaxa(blastn$realid, Taxonomizer)
#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
Taxa<-getTaxonomy(taxaId,Taxonomizer)
Taxa<-getTaxonomy(taxaId,'accessionTaxa.sql')
print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa)))
head(Taxa)
blastn<-cbind(blastn, Taxa)
dim(blastn)
dim(Taxa)
#### Keep one sequence per subject id
selecthit<-function(species){
tmp<-blastn[blastn$specie==species,]
tmp[order(tmp$evalue),][order(tmp$length),]
return(tmp$sseqid[1])
}
blastn<-cbind(blastn, Taxa)
blastn$evalue<-unlist(blastn$evalue)
blastn$length<-unlist(blastn$length)
listid<-na.omit(unlist(sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))))
blastn$sseqid<-unlist(blastn$sseqid)
#getsp<-function(id){
# cmd=paste0("efetch -db nuccore -id ", id, " -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'")
# species<-system(cmd)
# return(species)
# }
blastn<-blastn[blastn$sseqid %in% listid,]
print(paste("Keep best hit for each species, nrow = ", nrow(blastn)))
#species<-sapply(blastn$realid, function(x) getsp(x))
listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
#### Retrieve sequences via blastcmd
system(paste0("rm ", fasta))
for (seq in listid_short){
cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ", seq, " >> ", fasta)
system(cmd)
}
#### Garder 1 séquence par espèce
#### Change names in the fasta
library(ape)
aln<-read.dna(fasta, format="fasta")
names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_")
names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1])
newnames<-sapply(names(aln), function(x){
names(listid_short)[listid_short==x]
})
names(aln)<-newnames
#### Now select 1 sequence per species
#species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1"
fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE)
#selecthit<-function(species){
# tmp<-blastn[blastn$specie==species,]
# tmp[order(tmp$evalue),][order(tmp$length),]
# return(tmp$sseqid[1])
#}
write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
#listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))
#listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2))
#for (seq in listid2){
#cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
#system(cmd)
#}
......@@ -21,6 +21,9 @@ echo "--------------------------------------------------"
module load R
BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc"
TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql
# variable
OUT_BLAST=$1
TAB_SP=$2
......
......@@ -8,7 +8,7 @@ if (length(args)!=6) {
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
print("reading arguments and data...")
......@@ -18,7 +18,8 @@ percid=as.numeric(args[3])
overlap=as.numeric(args[4])
nind=as.numeric(args[5])
fasta=args[6]
blastdb=args[7]
Taxonomizer=args[8]
blastn<-read.table(file, h=F, fill=T)
......@@ -60,9 +61,12 @@ library(taxonomizr)
#prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql")
blastn$realid<-unlist(blastn$realid)
taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
#taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
taxaId<-accessionToTaxa(blastn$realid, Taxonomizer)
#Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
Taxa<-getTaxonomy(taxaId,Taxonomizer)
Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa)))
blastn<-cbind(blastn, Taxa)
......@@ -88,7 +92,7 @@ listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
#### Retrieve sequences via blastcmd
system(paste0("rm ", fasta))
for (seq in listid_short){
cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
cmd<-paste0("blastdbcmd -db ", blastdb, " -entry ", seq, " >> ", fasta)
system(cmd)
}
......
......@@ -20,6 +20,9 @@ echo "--------------------------------------------------"
##################################################################################################################
module load R
BLASTDBNUC="~/2021_legio/blastdb/phyloref/phyloref_nuc"
TAXO="/home/mcariou/2021_legio/Taxo/accessionTaxa.sql"
# variable
OUT_BLAST=$1
......@@ -58,7 +61,7 @@ echo "sequences already retrieved"
echo $FASTA
else
echo "retrieve fasta"
Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log
Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA $BLASTDBNUC $TAXO > $FASTA_REP/paste_blast_test.log
fi ;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment