Commit 1fa79c25 authored by mcariou's avatar mcariou
Browse files

correct script 4

parent 2eb685ac
......@@ -176,6 +176,17 @@ run prank, trimal and phyml on the fasta output from step 4.
## 6. To do
16/11/2021
Les script 4 mache sur lpg2300 et récupère 49 séquences
--> checker quelles espèces sont présentes et cohérence avec résultats de Margaux + quels sont les espèces manquantes
--> Faire en sorte que ce script marche aussi sur plusieurs gènes
- Adapt the pipeline to use more than one gene, make concatenates, and format presence/absence for all genes.
=> make script 3_run_PSIblast.sh
to blast all 78 genes retrieved from uniprot on the specific database designed for phylogeny.
......
......@@ -8,7 +8,7 @@ if (length(args)!=6) {
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
print("reading arguments and data...")
......@@ -84,17 +84,34 @@ blastn$sseqid<-unlist(blastn$sseqid)
blastn<-blastn[blastn$sseqid %in% listid,]
print(paste("Keep best hit for each species, nrow = ", nrow(blastn)))
listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
#### Retrieve sequences via blastcmd
for (seq in listid2){
system(paste0("rm ", fasta))
for (seq in listid_short){
cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
system(cmd)
}
#### Change names in the fasta
library(ape)
aln<-read.dna(fasta, format="fasta")
names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_")
names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1])
newnames<-sapply(names(aln), function(x){
names(listid_short)[listid_short==x]
})
names(aln)<-newnames
fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE)
write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
......
......@@ -30,7 +30,7 @@ FASTA_REP=$3
mkdir -p $FASTA_REP
#GENES=`grep ">" $4 | sed 's/>//g'`
Genes="lpg2300"
Gene="lpg2300"
EVAL=$5
......@@ -55,6 +55,7 @@ echo `wc -l $subblast`
if [[ -s $FASTA ]] ; then
echo "sequences already retrieved"
echo $FASTA
else
echo "retrieve fasta"
Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log
......
......@@ -10,6 +10,8 @@
## the dirs must exist before job is launched
#$ -o /home/mcariou/2021_legio/log/
#$ -e /home/mcariou/2021_legio/log/
module load R
./4_parse_PSIblastTMP.sh ~/2021_legio/out_blastn/lpg2300.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/lpg2300 ~/2021_legio/genes/lpg2300.fasta 0.0001 0.1 0.1 1
......
grep package:base R Documentation
_P_a_t_t_e_r_n _M_a_t_c_h_i_n_g _a_n_d _R_e_p_l_a_c_e_m_e_n_t
_D_e_s_c_r_i_p_t_i_o_n:
‘grep’, ‘grepl’, ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’
search for matches to argument ‘pattern’ within each element of a
character vector: they differ in the format of and amount of
detail in the results.
‘sub’ and ‘gsub’ perform replacement of the first and all matches
respectively.
_U_s_a_g_e:
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
fixed = FALSE, useBytes = FALSE, invert = FALSE)
grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexec(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gregexec(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
_A_r_g_u_m_e_n_t_s:
pattern: character string containing a regular expression (or
character string for ‘fixed = TRUE’) to be matched in the
given character vector. Coerced by ‘as.character’ to a
character string if possible. If a character vector of
length 2 or more is supplied, the first element is used with
a warning. Missing values are allowed except for ‘regexpr’,
‘gregexpr’ and ‘regexec’.
x, text: a character vector where matches are sought, or an object
which can be coerced by ‘as.character’ to a character vector.
Long vectors are supported.
ignore.case: if ‘FALSE’, the pattern matching is _case sensitive_ and
if ‘TRUE’, case is ignored during matching.
perl: logical. Should Perl-compatible regexps be used?
value: if ‘FALSE’, a vector containing the (‘integer’) indices of
the matches determined by ‘grep’ is returned, and if ‘TRUE’,
a vector containing the matching elements themselves is
returned.
fixed: logical. If ‘TRUE’, ‘pattern’ is a string to be matched as
is. Overrides all conflicting arguments.
useBytes: logical. If ‘TRUE’ the matching is done byte-by-byte rather
than character-by-character. See ‘Details’.
invert: logical. If ‘TRUE’ return indices or values for elements
that do _not_ match.
replacement: a replacement for matched pattern in ‘sub’ and ‘gsub’.
Coerced to character if possible. For ‘fixed = FALSE’ this
can include backreferences ‘"\1"’ to ‘"\9"’ to parenthesized
subexpressions of ‘pattern’. For ‘perl = TRUE’ only, it can
also contain ‘"\U"’ or ‘"\L"’ to convert the rest of the
replacement to upper or lower case and ‘"\E"’ to end case
conversion. If a character vector of length 2 or more is
supplied, the first element is used with a warning. If ‘NA’,
all elements in the result corresponding to matches will be
set to ‘NA’.
_D_e_t_a_i_l_s:
Arguments which should be character strings or character vectors
are coerced to character if possible.
Each of these functions operates in one of three modes:
1. ‘fixed = TRUE’: use exact matching.
2. ‘perl = TRUE’: use Perl-style regular expressions.
3. ‘fixed = FALSE, perl = FALSE’: use POSIX 1003.2 extended
regular expressions (the default).
See the help pages on regular expression for details of the
different types of regular expressions.
The two ‘*sub’ functions differ only in that ‘sub’ replaces only
the first occurrence of a ‘pattern’ whereas ‘gsub’ replaces all
occurrences. If ‘replacement’ contains backreferences which are
not defined in ‘pattern’ the result is undefined (but most often
the backreference is taken to be ‘""’).
For ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ it is an error
for ‘pattern’ to be ‘NA’, otherwise ‘NA’ is permitted and gives an
‘NA’ match.
Both ‘grep’ and ‘grepl’ take missing values in ‘x’ as not matching
a non-missing ‘pattern’.
The main effect of ‘useBytes = TRUE’ is to avoid errors/warnings
about invalid inputs and spurious matches in multibyte locales,
but for ‘regexpr’ it changes the interpretation of the output. It
inhibits the conversion of inputs with marked encodings, and is
forced if any input is found which is marked as ‘"bytes"’ (see
‘Encoding’).
Caseless matching does not make much sense for bytes in a
multibyte locale, and you should expect it only to work for ASCII
characters if ‘useBytes = TRUE’.
‘regexpr’ and ‘gregexpr’ with ‘perl = TRUE’ allow Python-style
named captures, but not for _long vector_ inputs.
Invalid inputs in the current locale are warned about up to 5
times.
Caseless matching with ‘perl = TRUE’ for non-ASCII characters
depends on the PCRE library being compiled with ‘Unicode property
support’, which PCRE2 is by default.
_V_a_l_u_e:
‘grep(value = FALSE)’ returns a vector of the indices of the
elements of ‘x’ that yielded a match (or not, for ‘invert =
TRUE’). This will be an integer vector unless the input is a
_long vector_, when it will be a double vector.
‘grep(value = TRUE)’ returns a character vector containing the
selected elements of ‘x’ (after coercion, preserving names but no
other attributes).
‘grepl’ returns a logical vector (match or not for each element of
‘x’).
‘sub’ and ‘gsub’ return a character vector of the same length and
with the same attributes as ‘x’ (after possible coercion to
character). Elements of character vectors ‘x’ which are not
substituted will be returned unchanged (including any declared
encoding). If ‘useBytes = FALSE’ a non-ASCII substituted result
will often be in UTF-8 with a marked encoding (e.g., if there is a
UTF-8 input, and in a multibyte locale unless ‘fixed = TRUE’).
Such strings can be re-encoded by ‘enc2native’.
‘regexpr’ returns an integer vector of the same length as ‘text’
giving the starting position of the first match or -1 if there is
none, with attribute ‘"match.length"’, an integer vector giving
the length of the matched text (or -1 for no match). The match
positions and lengths are in characters unless ‘useBytes = TRUE’
is used, when they are in bytes (as they are for ASCII-only
matching: in either case an attribute ‘useBytes’ with value ‘TRUE’
is set on the result). If named capture is used there are further
attributes ‘"capture.start"’, ‘"capture.length"’ and
‘"capture.names"’.
‘gregexpr’ returns a list of the same length as ‘text’ each
element of which is of the same form as the return value for
‘regexpr’, except that the starting positions of every (disjoint)
match are given.
‘regexec’ returns a list of the same length as ‘text’ each element
of which is either -1 if there is no match, or a sequence of
integers with the starting positions of the match and all
substrings corresponding to parenthesized subexpressions of
‘pattern’, with attribute ‘"match.length"’ a vector giving the
lengths of the matches (or -1 for no match). The interpretation
of positions and length and the attributes follows ‘regexpr’.
‘gregexec’ returns the same as ‘regexec’, except that to
accommodate multiple matches per element of ‘text’, the integer
sequences for each match are made into columns of a matrix, with
one matrix per element of ‘text’ with matches.
Where matching failed because of resource limits (especially for
‘perl = TRUE’) this is regarded as a non-match, usually with a
warning.
_W_a_r_n_i_n_g:
The POSIX 1003.2 mode of ‘gsub’ and ‘gregexpr’ does not work
correctly with repeated word-boundaries (e.g., ‘pattern = "\b"’).
Use ‘perl = TRUE’ for such matches (but that may not work as
expected with non-ASCII inputs, as the meaning of ‘word’ is
system-dependent).
_P_e_r_f_o_r_m_a_n_c_e _c_o_n_s_i_d_e_r_a_t_i_o_n_s:
If you are doing a lot of regular expression matching, including
on very long strings, you will want to consider the options used.
Generally ‘perl = TRUE’ will be faster than the default regular
expression engine, and ‘fixed = TRUE’ faster still (especially
when each pattern is matched only a few times).
If you are working in a single-byte locale and have marked UTF-8
strings that are representable in that locale, convert them first
as just one UTF-8 string will force all the matching to be done in
Unicode, which attracts a penalty of around 3x for the default
POSIX 1003.2 mode.
If you can make use of ‘useBytes = TRUE’, the strings will not be
checked before matching, and the actual matching will be faster.
Often byte-based matching suffices in a UTF-8 locale since byte
patterns of one character never match part of another. Character
ranges may produce unexpected results.
PCRE-based matching by default used to put additional effort into
‘studying’ the compiled pattern when ‘x’/‘text’ has length 10 or
more. That study may use the PCRE JIT compiler on platforms where
it is available (see ‘pcre_config’). As from PCRE2 (PCRE version
>= 10.00 as reported by ‘extSoftVersion’), there is no study
phase, but the patterns are optimized automatically when possible,
and PCRE JIT is used when enabled. The details are controlled by
‘options’ ‘PCRE_study’ and ‘PCRE_use_JIT’. (Some timing
comparisons can be seen by running file ‘tests/PCRE.R’ in the R
sources (and perhaps installed).) People working with PCRE and
very long strings can adjust the maximum size of the JIT stack by
setting environment variable ‘R_PCRE_JIT_STACK_MAXSIZE’ before JIT
is used to a value between ‘1’ and ‘1000’ in MB: the default is
‘64’. When JIT is not used with PCRE version < 10.30 (that is with
PCRE1 and old versions of PCRE2), it might also be wise to set the
option ‘PCRE_limit_recursion’.
_N_o_t_e:
Aspects will be platform-dependent as well as local-dependent: for
example the implementation of character classes (except
‘[:digit:]’ and ‘[:xdigit:]’). One can expect results to be
consistent for ASCII inputs and when working in UTF-8 mode (when
most platforms will use Unicode character tables, although those
are updated frequently and subject to some degree of
interpretation - is a circled capital letter alphabetic or a
symbol?). However, results in 8-bit encodings can differ
considerably between platforms, modes and from the UTF-8 versions.
_S_o_u_r_c_e:
The C code for POSIX-style regular expression matching has changed
over the years. As from R 2.10.0 (Oct 2009) the TRE library of
Ville Laurikari (<URL: https://github.com/laurikari/tre>) is used.
The POSIX standard does give some room for interpretation,
especially in the handling of invalid regular expressions and the
collation of character ranges, so the results will have changed
slightly over the years.
For Perl-style matching PCRE2 or PCRE (<URL:
https://www.pcre.org>) is used: again the results may depend
(slightly) on the version of PCRE in use.
_R_e_f_e_r_e_n_c_e_s:
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
Language_. Wadsworth & Brooks/Cole (‘grep’)
_S_e_e _A_l_s_o:
regular expression (aka ‘regexp’) for the details of the pattern
specification.
‘regmatches’ for extracting matched substrings based on the
results of ‘regexpr’, ‘gregexpr’ and ‘regexec’.
‘glob2rx’ to turn wildcard matches into regular expressions.
‘agrep’ for approximate matching.
‘charmatch’, ‘pmatch’ for partial matching, ‘match’ for matching
to whole strings, ‘startsWith’ for matching of initial parts of
strings.
‘tolower’, ‘toupper’ and ‘chartr’ for character translations.
‘apropos’ uses regexps and has more examples.
‘grepRaw’ for matching raw vectors.
Options ‘PCRE_limit_recursion’, ‘PCRE_study’ and ‘PCRE_use_JIT’.
‘extSoftVersion’ for the versions of regex and PCRE libraries in
use, ‘pcre_config’ for more details for PCRE.
_E_x_a_m_p_l_e_s:
grep("[a-z]", letters)
txt <- c("arm","foot","lefroo", "bafoobar")
if(length(i <- grep("foo", txt)))
cat("'foo' appears at least once in\n\t", txt, "\n")
i # 2 and 4
txt[i]
## Double all 'a' or 'b's; "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC")
txt <- c("The", "licenses", "for", "most", "software", "are",
"designed", "to", "take", "away", "your", "freedom",
"to", "share", "and", "change", "it.",
"", "By", "contrast,", "the", "GNU", "General", "Public", "License",
"is", "intended", "to", "guarantee", "your", "freedom", "to",
"share", "and", "change", "free", "software", "--",
"to", "make", "sure", "the", "software", "is",
"free", "for", "all", "its", "users")
( i <- grep("[gu]", txt) ) # indices
stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )
## Note that for some implementations character ranges are
## locale-dependent (but not currently). Then [b-e] in locales such as
## en_US may include B as the collation order is aAbBcCdDe ...
(ot <- sub("[b-e]",".", txt))
txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution
## In caseless matching, ranges include both cases:
a <- grep("[b-e]", txt, value = TRUE)
b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE)
setdiff(b, a)
txt[gsub("g","#", txt) !=
gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
regexpr("en", txt)
gregexpr("e", txt)
## Using grepl() for filtering
## Find functions with argument names matching "warn":
findArgs <- function(env, pattern) {
nms <- ls(envir = as.environment(env))
nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack"
aa <- sapply(nms, function(.) { o <- get(.)
if(is.function(o)) names(formals(o)) })
iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE)))
aa[iw]
}
findArgs("package:base", "warn")
## trim trailing white space
str <- "Now is the time "
sub(" +$", "", str) ## spaces only
## what is considered 'white space' depends on the locale.
sub("[[:space:]]+$", "", str) ## white space, POSIX-style
## what PCRE considered white space changed in version 8.34: see ?regex
sub("\\s+$", "", str, perl = TRUE) ## PCRE-style white space
## capitalizing
txt <- "a test of capitalizing"
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE)
txt2 <- "useRs may fly into JFK or laGuardia"
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
## named capture
notables <- c(" Ben Franklin and Jefferson Davis",
"\tMillard Fillmore")
# name groups 'first' and 'last'
name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
(parsed <- regexpr(name.rex, notables, perl = TRUE))
gregexpr(name.rex, notables, perl = TRUE)[[2]]
parse.one <- function(res, result) {
m <- do.call(rbind, lapply(seq_along(res), function(i) {
if(result[i] == -1) return("")
st <- attr(result, "capture.start")[i, ]
substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1)
}))
colnames(m) <- attr(result, "capture.names")
m
}
parse.one(notables, parsed)
## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
m
regmatches(x, m)
## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path. We can use this to make a function for extracting the
## parts of a URL:
URL_parts <- function(x) {
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
parts <- do.call(rbind,
lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L)))
colnames(parts) <- c("protocol","host","port","path")
parts
}
URL_parts(x)
## gregexec() may match multiple times within a single string.
pattern <- "([[:alpha:]]+)([[:digit:]]+)"
s <- "Test: A1 BC23 DEF456"
m <- gregexec(pattern, s)
m
regmatches(s, m)
## Before gregexec() was implemented, one could emulate it by running
## regexec() on the regmatches obtained via gregexpr(). E.g.:
lapply(regmatches(s, gregexpr(pattern, s)),
function(e) regmatches(e, regexec(pattern, e)))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment