Commit 63ee1143 authored by mcariou's avatar mcariou
Browse files

update

parent 07473fe9
This diff is collapsed.
......@@ -8,8 +8,8 @@ echo "--------------------------------------------------"
#./make_blast.sh /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/blastdb/cdslegio /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/genes/lp0952.fasta /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn
### PSMN
#./make_blast.sh /Xnfs/ciridb/shared_data/Databases/legio_cds/blastdb_ncbi/cdslegio ~/2021_legio/genes/lp0952.fasta ~/2021_legio/out_blastn
#./make_blast.sh ~/2021_legio/blastdb/phyloref/phyloref ~/2021_legio/genes/78Lp_uniprot.fasta ~/2021_legio/out_blastn
#./3_make_PSIblast.sh /Xnfs/ciridb/shared_data/Databases/legio_cds/blastdb_ncbi/cdslegio ~/2021_legio/genes/lp0952.fasta ~/2021_legio/out_blastn
#./3_make_PSIblast.sh ~/2021_legio/blastdb/phyloref/phyloref_prot ~/2021_legio/genes/78Lp_uniprot.fasta ~/2021_legio/out_blastn
##################################################################################################################
# variable
......
......@@ -7,6 +7,8 @@ if (length(args)!=6) {
}
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
file<-args[1]
eval=as.numeric(args[2])
......@@ -22,9 +24,7 @@ blastn<-read.table(file, h=F, fill=T)
names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length",
"pident","evalue","nident","mismatch", "gapopen",
"qstart", "qend", "qseq", "score", "sstart",
"send", "sseq", "sstrand", "species")
"send", "sseq", "sstrand")
### report
......@@ -55,25 +55,68 @@ print(paste("Number of hits after perc id filter: ", nrow(blastn)))
print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
##### Get species id
blastn$realid<-sapply(as.character(blastn$sseqid), function(x) strsplit(x, ".", fixed=TRUE)[[1]][1])
## Use an reutils?
library(reutils)
uid<-blastn$realid[1]
tmp<-efetch(uid, db = "nuccore", rettype="gb")
### Use Taxonomizr
#install.packages("taxonomizr")
library(taxonomizr)
prepareDatabase("accessionTaxa.sql")
taxaId<-accessionToTaxa(uid,"accessionTaxa.sql")
Taxa<-getTaxonomy(taxaId,'accessionTaxa.sql')
head(Taxa)
dim(blastn)
dim(Taxa)
blastn<-cbind(blastn, Taxa)
#getsp<-function(id){
# cmd=paste0("efetch -db nuccore -id ", id, " -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'")
# species<-system(cmd)
# return(species)
# }
#species<-sapply(blastn$realid, function(x) getsp(x))
#### Garder 1 séquence par espèce
species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1"
selecthit<-function(species){
tmp<-blastn[blastn$specie==species,]
tmp[order(tmp$evalue),][order(tmp$length),]
return(tmp$sseqid[1])
}
listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))
listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2))
for (seq in listid2){
cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
system(cmd)
}
#### Now select 1 sequence per species
#species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1"
#selecthit<-function(species){
# tmp<-blastn[blastn$specie==species,]
# tmp[order(tmp$evalue),][order(tmp$length),]
# return(tmp$sseqid[1])
#}
#listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))
#listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2))
#for (seq in listid2){
#cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
#system(cmd)
#}
......@@ -18,6 +18,8 @@ echo "--------------------------------------------------"
### PSMN
#./4_parse_blast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.3 0.5 3
##################################################################################################################
module load R
# variable
OUT_BLAST=$1
......@@ -27,7 +29,9 @@ BLAST_REP=`echo $1 | awk -F "/" '{$NF=""; print $0}' | sed 's/ /\//g'`
FASTA_REP=$3
mkdir -p $FASTA_REP
GENES=`grep ">" $4 | sed 's/>//g'`
#GENES=`grep ">" $4 | sed 's/>//g'`
GENES="lpg2300"
EVAL=$5
PERCID=$6
......@@ -49,30 +53,41 @@ subblast=$FASTA_REP"/"$Gene"/"$Gene".psiblast"
grep $Gene $OUT_BLAST > $subblast
FASTA=$FASTA_REP"/"$Gene"/"$Gene".fasta"
nrow=`cat $subblast | wc -l`
if [[ -s $subblast.mod ]] ; then
echo "sp names already retrieved"
else
echo "debut de la boucle:"
for id in `seq $nrow`
do
line=`sed -n ${id}p $subblast`
idncbi=`sed -n ${id}p $subblast | awk '{print $3}' | awk -F[\.] '{print $1}'`
tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
echo $tax
echo $line " " $tax >> $subblast.mod
done
fi ;
## Filter data first because otherwiase Blast file is too long (it is not necessary to retrieve organism for every line)
nrow=`cat $subblast | wc -l`
if [[ -s $FASTA ]] ; then
echo "sequences already retrieved"
else
echo "retrieve fasta"
Rscript --vanilla $rscript $subblast.mod $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $FASTA_REP/$Gene/paste_blast.log
Rscript --vanilla $rscript $subblast $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $FASTA_REP/$Gene/paste_blast_test.log
fi ;
# if [[ -s $subblast.mod ]] ; then
# echo "sp names already retrieved"
# else
# echo "debut de la boucle:"
# for id in `seq $nrow`
# do
# line=`sed -n ${id}p $subblast`
# idncbi=`sed -n ${id}p $subblast | awk '{print $3}' | awk -F[\.] '{print $1}'`
# tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
# echo $tax
# echo $line " " $tax >> $subblast.mod
# done
# fi ;
#if [[ -s $FASTA ]] ; then
#echo "sequences already retrieved"
#else
#echo "retrieve fasta"
#Rscript --vanilla $rscript $subblast.mod $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $FASTA_REP/$Gene/paste_blast.log
#fi ;
done
......
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)!=6) {
stop("5 arguments must be supplied.\n", call.=FALSE)
}
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
print("reading arguments and data...")
file<-args[1]
eval=as.numeric(args[2])
percid=as.numeric(args[3])
overlap=as.numeric(args[4])
nind=as.numeric(args[5])
fasta=args[6]
blastn<-read.table(file, h=F, fill=T)
names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length",
"pident","evalue","nident","mismatch", "gapopen",
"qstart", "qend", "qseq", "score", "sstart",
"send", "sseq", "sstrand")
print(paste("Number of total hit: ", nrow(blastn)))
print(paste("Number of query:", length(unique(blastn$qseqid))))
print("Filtering based on hit caracteristics")
blastn<-blastn[blastn$evalue<=eval,]
print(paste("Number of hits after evalue filter: ", nrow(blastn)))
blastn<-blastn[blastn$pident>=percid*100,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
blastn<-blastn[blastn$slen/as.numeric(blastn$qlen)>=overlap,]
print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
##### Reformate subject id
blastn$realid<-sapply(as.character(blastn$sseqid), function(x) paste0(
strsplit(x, ".", fixed=TRUE)[[1]][1], ".",
strsplit(x, ".", fixed=TRUE)[[1]][2]))
#### Keep one sequence per subject id
selecthit<-function(subjectid){
tmp<-blastn[blastn$sseqid==subjectid,]
return(tmp[order(tmp$evalue),][order(tmp$length),][1,])
}
blastn<-as.data.frame(t(sapply(unique(blastn$sseqid), selecthit)))
print(paste("Keep best hit for each subject, nrow = ", nrow(blastn)))
### Use Taxonomizr to get species names
#install.packages("taxonomizr")
library(taxonomizr)
#prepareDatabase("/home/mcariou/2021_legio/accessionTaxa.sql")
blastn$realid<-unlist(blastn$realid)
taxaId<-accessionToTaxa(blastn$realid,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
Taxa<-getTaxonomy(taxaId,"/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
print(paste0("Species names retrieved; dim blastn et taxa: ", dim(blastn), ", ",dim(Taxa)))
blastn<-cbind(blastn, Taxa)
#### Keep one sequence per subject id
selecthit<-function(species){
tmp<-blastn[blastn$specie==species,]
tmp[order(tmp$evalue),][order(tmp$length),]
return(tmp$sseqid[1])
}
blastn$evalue<-unlist(blastn$evalue)
blastn$length<-unlist(blastn$length)
listid<-na.omit(unlist(sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))))
blastn$sseqid<-unlist(blastn$sseqid)
blastn<-blastn[blastn$sseqid %in% listid,]
print(paste("Keep best hit for each species, nrow = ", nrow(blastn)))
#### Retrieve sequences via blastcmd
for (seq in listid2){
cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
system(cmd)
}
#### Change names in the fasta
#!/bin/bash
echo "USAGE: ./4_parse_PSIblast.sh \$1=blast_out_file
\$2=tabgenome \$3=fasta_out_rep
\$4=seuil_evalue \$5=percid \$6=percoverlap
\$7=maxseqpertax"
echo "--------------------------------------------------"
##################################################################################################################
### En local
#./4_parse_blast.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952.blastn \
#~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952_sp.blastn \
#~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/lp0952_ortho.fasta
#0.0001 0.3 0.5 10
#./4_parse_blast.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952_sp.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/lp0952_ortho.fasta 0.0001 0.3 0.5 10
### PSMN
#./4_parse_blast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.3 0.5 3
##################################################################################################################
module load R
# variable
OUT_BLAST=$1
TAB_SP=$2
BLAST_REP=`echo $1 | awk -F "/" '{$NF=""; print $0}' | sed 's/ /\//g'`
FASTA_REP=$3
mkdir -p $FASTA_REP
#GENES=`grep ">" $4 | sed 's/>//g'`
Genes="lpg2300"
EVAL=$5
PERCID=$6
PERCOVERLAP=$7
NPERTAX=$8
echo $0
rscript=`ls $0 | sed 's/.sh/.R/g'`
echo $rscript
#################################################################################################################
# Read fasta query, to write proper aln.
mkdir -p $FASTA_REP
echo $Gene
FASTA=$FASTA_REP"/"$Gene".fasta"
## Filter data first because otherwiase Blast file is too long (it is not necessary to retrieve organism for every line)
echo `wc -l $subblast`
if [[ -s $FASTA ]] ; then
echo "sequences already retrieved"
else
echo "retrieve fasta"
Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log
fi ;
# fin
......@@ -11,7 +11,7 @@
#$ -o /home/mcariou/2021_legio/log/
#$ -e /home/mcariou/2021_legio/log/
./4_parse_PSIblast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.3 0.5 1
./4_parse_PSIblastTMP.sh ~/2021_legio/out_blastn/lpg2300.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/lpg2300 ~/2021_legio/genes/lpg2300.fasta 0.0001 0.1 0.1 1
......
2.6.0+
1
Q5ZXP3_lpg0688
/home/mcariou/2021_legio/blastdb/phyloref/phyloref_prot
query
hits
AE017354.1.p373_1
CR628337.1.p362_1
FQ958210.1.p381_1
CP001828.1.p378_1
CR628336.1.p382_1
CP000675.2.p381_1
JNCF01000046.1.p5_1
LNZB01000060.1.p91_1
LNYR01000034.1.p54_1
LNZC01000001.1.p18_1
LN614827.1.p493_1
LNYN01000042.1.p56_1
LNYW01000067.1.p7_1
LNXT01000048.1.p93_1
LNXX01000004.1.p12_1
LNYE01000009.1.p14_1
LNYV01000015.1.p23_1
LNYS01000001.1.p20_1
CAAAIM010000011.1.p14_1
FN650140.1.p448_1
LNYU01000032.1.p28_1
LN901321.1.p48_1
JNIA01000024.1.p18_1
LNYZ01000029.1.p9_1
CAAAID010000002.1.p45_1
LNYP01000029.1.p77_1
LNXW01000008.1.p28_1
CAAAIA010000014.1.p9_1
JH413817.1.p18_1
LNYY01000021.1.p116_1
CAAAIW010000005.1.p16_1
LNYH01000058.1.p13_1
LNXV01000005.1.p26_1
LNYL01000026.1.p4_1
LNXS01000029.1.p73_1
LNXU01000019.1.p78_1
LNYF01000001.1.p62_1
LNYI01000048.1.p5_1
LNYK01000014.1.p72_1
CP016397.1.p355_1
LNYQ01000013.1.p281_1
LNYX01000032.1.p21_1
LNYJ01000011.1.p312_1
LNZA01000001.1.p292_1
LNYG01000001.1.p42_1
JHYC01000004.1.p26_1
UGNV01000001.1.p343_1
LNYB01000004.1.p11_1
CALJ01000113.1.p1_1
CCVW01000002.1.p114_1
LNYO01000024.1.p100_1
LNYT01000007.1.p118_1
LNYA01000001.1.p35_1
QCXN01000001.1.p67_1
LNXY01000027.1.p96_1
LNKA01000001.1.p167_1
CP021497.1.p53_1
LNYC01000044.1.p19_1
CP013667.1.p163_1
LNYQ01000012.1.p129_1
LNYY01000019.1.p372_1
CCVW01000002.1.p2_1
LN614827.1.p572_1
LNYN01000020.1.p80_1
LNYO01000013.1.p333_1
2.6.0+
2
Q5ZXP3_lpg0688
/home/mcariou/2021_legio/blastdb/phyloref/phyloref_prot
query
hits
AE017354.1.p373_1
CR628337.1.p362_1
FQ958210.1.p381_1
CP001828.1.p378_1
CR628336.1.p382_1
JNCF01000046.1.p5_1
CP000675.2.p381_1
LNYR01000034.1.p54_1
LNZC01000001.1.p18_1
LNYN01000042.1.p56_1
LNZB01000060.1.p91_1
LN614827.1.p493_1
CAAAID010000002.1.p45_1
LNYP01000029.1.p77_1
LNXV01000005.1.p26_1
LNYK01000014.1.p72_1
LNYH01000058.1.p13_1
LNXT01000048.1.p93_1
LNYS01000001.1.p20_1
CP016397.1.p355_1
LNYL01000026.1.p4_1
LNYF01000001.1.p62_1
LNYG01000001.1.p42_1
JHYC01000004.1.p26_1
UGNV01000001.1.p343_1
LNYI01000048.1.p5_1
LNKA01000001.1.p167_1
CAAAIA010000014.1.p9_1
LNYJ01000011.1.p312_1
LNYX01000032.1.p21_1
CAAAIW010000005.1.p16_1
CAAAIM010000011.1.p14_1
LNXX01000004.1.p12_1
LNYZ01000029.1.p9_1
LNYB01000004.1.p11_1
CALJ01000113.1.p1_1
LNYW01000067.1.p7_1
JH413817.1.p18_1
LNYU01000032.1.p28_1
LNYV01000015.1.p23_1
FN650140.1.p448_1
LNYE01000009.1.p14_1
LN901321.1.p48_1
LNYY01000021.1.p116_1
LNXW01000008.1.p28_1
JNIA01000024.1.p18_1
LNXS01000029.1.p73_1
CCVW01000002.1.p114_1
LNYT01000007.1.p118_1
LNXU01000019.1.p78_1
LNZA01000001.1.p292_1
LNYA01000001.1.p35_1
LNYO01000024.1.p100_1
QCXN01000001.1.p67_1
LNXY01000027.1.p96_1
LNYQ01000013.1.p281_1
LNYC01000044.1.p19_1
CP021497.1.p53_1
CP013667.1.p163_1
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment