diff --git a/README.md b/README.md
index 178b7f48231096898660952086dff1465ab21d55..5e6a6eca82a29e94e34327dd6699b9f8acb8f819 100755
--- a/README.md
+++ b/README.md
@@ -176,6 +176,17 @@ run prank, trimal and phyml on the fasta output from step 4.
 ## 6. To do
 
 
+16/11/2021
+Les script 4 mache sur lpg2300 et récupère 49 séquences 
+
+--> checker quelles espèces sont présentes et cohérence avec résultats de Margaux + quels sont les espèces manquantes
+
+--> Faire en sorte que ce script marche aussi sur plusieurs gènes
+
+
+
+
+
 - Adapt the pipeline to use more than one gene, make concatenates, and format presence/absence for all genes.
 => make script 3_run_PSIblast.sh
 to blast all 78 genes retrieved from uniprot on the specific database designed for phylogeny.
diff --git a/script/.4_parse_PSIblastTMP.R.swp b/script/.4_parse_PSIblastTMP.sh.swp
similarity index 57%
rename from script/.4_parse_PSIblastTMP.R.swp
rename to script/.4_parse_PSIblastTMP.sh.swp
index 2deb9f68dbe6f03a5b268b6bf6e37a48c7d306a2..ec2f9478529362620ec5b338ea323d8923ffbeea 100644
Binary files a/script/.4_parse_PSIblastTMP.R.swp and b/script/.4_parse_PSIblastTMP.sh.swp differ
diff --git a/script/4_parse_PSIblastTMP.R b/script/4_parse_PSIblastTMP.R
index 38af144dc65daca4453b8c32d669f21f64ea8029..68c90c714e29cf95a578a7d11ad86b1de90a60a1 100644
--- a/script/4_parse_PSIblastTMP.R
+++ b/script/4_parse_PSIblastTMP.R
@@ -8,7 +8,7 @@ if (length(args)!=6) {
 
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
 #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
-#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
+#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta")
 
 print("reading arguments and data...")
 
@@ -84,17 +84,34 @@ blastn$sseqid<-unlist(blastn$sseqid)
 blastn<-blastn[blastn$sseqid %in% listid,]
 print(paste("Keep best hit for each species, nrow = ", nrow(blastn)))
 
+listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2))
 #### Retrieve sequences via blastcmd
-for (seq in listid2){ 
+system(paste0("rm ", fasta))
+for (seq in listid_short){ 
 cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ",  seq, " >> ", fasta)
 system(cmd)
 }
 
+
 #### Change names in the fasta
+library(ape)
+
+aln<-read.dna(fasta, format="fasta")
+
+names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_")
+
+names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1])
+
 
+newnames<-sapply(names(aln), function(x){
+ names(listid_short)[listid_short==x]
+})
 
+names(aln)<-newnames
 
+fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE)
 
+write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
 
 
 
diff --git a/script/4_parse_PSIblastTMP.sh b/script/4_parse_PSIblastTMP.sh
index 05ee772c9b240ac491168dd4b4410445658abe65..b1367785a41795326a10951d4f5ab54840369656 100755
--- a/script/4_parse_PSIblastTMP.sh
+++ b/script/4_parse_PSIblastTMP.sh
@@ -30,7 +30,7 @@ FASTA_REP=$3
 mkdir -p $FASTA_REP
 
 #GENES=`grep ">" $4 | sed 's/>//g'`
-Genes="lpg2300"
+Gene="lpg2300"
 
 
 EVAL=$5
@@ -55,6 +55,7 @@ echo `wc -l $subblast`
 
 if [[ -s $FASTA ]] ; then
 echo "sequences already retrieved"
+echo $FASTA
 else
 echo "retrieve fasta"
 Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log
diff --git a/script/runscript.sh b/script/runscript.sh
index 4b4ec8f392c532f2eaab50b2e426c62294c119b1..8f721b025c1fe4af4e0f2272627259a86285d4fd 100755
--- a/script/runscript.sh
+++ b/script/runscript.sh
@@ -10,6 +10,8 @@
 ## the dirs must exist before job is launched
 #$ -o /home/mcariou/2021_legio/log/
 #$ -e /home/mcariou/2021_legio/log/
+module load R
+
  
 ./4_parse_PSIblastTMP.sh  ~/2021_legio/out_blastn/lpg2300.psiblast  ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/lpg2300 ~/2021_legio/genes/lpg2300.fasta 0.0001 0.1 0.1 1
 
diff --git "a/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" "b/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)"
new file mode 100644
index 0000000000000000000000000000000000000000..38a0b9ea13410af82f3b0f07da56e3452c7c0621
--- /dev/null
+++ "b/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)"	
@@ -0,0 +1,413 @@
+grep                   package:base                    R Documentation
+
+_P_a_t_t_e_r_n _M_a_t_c_h_i_n_g _a_n_d _R_e_p_l_a_c_e_m_e_n_t
+
+_D_e_s_c_r_i_p_t_i_o_n:
+
+     ‘grep’, ‘grepl’, ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’
+     search for matches to argument ‘pattern’ within each element of a
+     character vector: they differ in the format of and amount of
+     detail in the results.
+
+     ‘sub’ and ‘gsub’ perform replacement of the first and all matches
+     respectively.
+
+_U_s_a_g_e:
+
+     grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
+          fixed = FALSE, useBytes = FALSE, invert = FALSE)
+     
+     grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
+           fixed = FALSE, useBytes = FALSE)
+     
+     sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
+         fixed = FALSE, useBytes = FALSE)
+     
+     gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
+          fixed = FALSE, useBytes = FALSE)
+     
+     regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
+             fixed = FALSE, useBytes = FALSE)
+     
+     gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
+              fixed = FALSE, useBytes = FALSE)
+     
+     regexec(pattern, text, ignore.case = FALSE, perl = FALSE,
+             fixed = FALSE, useBytes = FALSE)
+     
+     gregexec(pattern, text, ignore.case = FALSE, perl = FALSE,
+             fixed = FALSE, useBytes = FALSE)
+     
+_A_r_g_u_m_e_n_t_s:
+
+ pattern: character string containing a regular expression (or
+          character string for ‘fixed = TRUE’) to be matched in the
+          given character vector.  Coerced by ‘as.character’ to a
+          character string if possible.  If a character vector of
+          length 2 or more is supplied, the first element is used with
+          a warning.  Missing values are allowed except for ‘regexpr’,
+          ‘gregexpr’ and ‘regexec’.
+
+ x, text: a character vector where matches are sought, or an object
+          which can be coerced by ‘as.character’ to a character vector.
+          Long vectors are supported.
+
+ignore.case: if ‘FALSE’, the pattern matching is _case sensitive_ and
+          if ‘TRUE’, case is ignored during matching.
+
+    perl: logical.  Should Perl-compatible regexps be used?
+
+   value: if ‘FALSE’, a vector containing the (‘integer’) indices of
+          the matches determined by ‘grep’ is returned, and if ‘TRUE’,
+          a vector containing the matching elements themselves is
+          returned.
+
+   fixed: logical.  If ‘TRUE’, ‘pattern’ is a string to be matched as
+          is.  Overrides all conflicting arguments.
+
+useBytes: logical.  If ‘TRUE’ the matching is done byte-by-byte rather
+          than character-by-character.  See ‘Details’.
+
+  invert: logical.  If ‘TRUE’ return indices or values for elements
+          that do _not_ match.
+
+replacement: a replacement for matched pattern in ‘sub’ and ‘gsub’.
+          Coerced to character if possible.  For ‘fixed = FALSE’ this
+          can include backreferences ‘"\1"’ to ‘"\9"’ to parenthesized
+          subexpressions of ‘pattern’.  For ‘perl = TRUE’ only, it can
+          also contain ‘"\U"’ or ‘"\L"’ to convert the rest of the
+          replacement to upper or lower case and ‘"\E"’ to end case
+          conversion.  If a character vector of length 2 or more is
+          supplied, the first element is used with a warning.  If ‘NA’,
+          all elements in the result corresponding to matches will be
+          set to ‘NA’.
+
+_D_e_t_a_i_l_s:
+
+     Arguments which should be character strings or character vectors
+     are coerced to character if possible.
+
+     Each of these functions operates in one of three modes:
+
+       1. ‘fixed = TRUE’: use exact matching.
+
+       2. ‘perl = TRUE’: use Perl-style regular expressions.
+
+       3. ‘fixed = FALSE, perl = FALSE’: use POSIX 1003.2 extended
+          regular expressions (the default).
+
+     See the help pages on regular expression for details of the
+     different types of regular expressions.
+
+     The two ‘*sub’ functions differ only in that ‘sub’ replaces only
+     the first occurrence of a ‘pattern’ whereas ‘gsub’ replaces all
+     occurrences.  If ‘replacement’ contains backreferences which are
+     not defined in ‘pattern’ the result is undefined (but most often
+     the backreference is taken to be ‘""’).
+
+     For ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ it is an error
+     for ‘pattern’ to be ‘NA’, otherwise ‘NA’ is permitted and gives an
+     ‘NA’ match.
+
+     Both ‘grep’ and ‘grepl’ take missing values in ‘x’ as not matching
+     a non-missing ‘pattern’.
+
+     The main effect of ‘useBytes = TRUE’ is to avoid errors/warnings
+     about invalid inputs and spurious matches in multibyte locales,
+     but for ‘regexpr’ it changes the interpretation of the output.  It
+     inhibits the conversion of inputs with marked encodings, and is
+     forced if any input is found which is marked as ‘"bytes"’ (see
+     ‘Encoding’).
+
+     Caseless matching does not make much sense for bytes in a
+     multibyte locale, and you should expect it only to work for ASCII
+     characters if ‘useBytes = TRUE’.
+
+     ‘regexpr’ and ‘gregexpr’ with ‘perl = TRUE’ allow Python-style
+     named captures, but not for _long vector_ inputs.
+
+     Invalid inputs in the current locale are warned about up to 5
+     times.
+
+     Caseless matching with ‘perl = TRUE’ for non-ASCII characters
+     depends on the PCRE library being compiled with ‘Unicode property
+     support’, which PCRE2 is by default.
+
+_V_a_l_u_e:
+
+     ‘grep(value = FALSE)’ returns a vector of the indices of the
+     elements of ‘x’ that yielded a match (or not, for ‘invert =
+     TRUE’).  This will be an integer vector unless the input is a
+     _long vector_, when it will be a double vector.
+
+     ‘grep(value = TRUE)’ returns a character vector containing the
+     selected elements of ‘x’ (after coercion, preserving names but no
+     other attributes).
+
+     ‘grepl’ returns a logical vector (match or not for each element of
+     ‘x’).
+
+     ‘sub’ and ‘gsub’ return a character vector of the same length and
+     with the same attributes as ‘x’ (after possible coercion to
+     character).  Elements of character vectors ‘x’ which are not
+     substituted will be returned unchanged (including any declared
+     encoding).  If ‘useBytes = FALSE’ a non-ASCII substituted result
+     will often be in UTF-8 with a marked encoding (e.g., if there is a
+     UTF-8 input, and in a multibyte locale unless ‘fixed = TRUE’).
+     Such strings can be re-encoded by ‘enc2native’.
+
+     ‘regexpr’ returns an integer vector of the same length as ‘text’
+     giving the starting position of the first match or -1 if there is
+     none, with attribute ‘"match.length"’, an integer vector giving
+     the length of the matched text (or -1 for no match).  The match
+     positions and lengths are in characters unless ‘useBytes = TRUE’
+     is used, when they are in bytes (as they are for ASCII-only
+     matching: in either case an attribute ‘useBytes’ with value ‘TRUE’
+     is set on the result).  If named capture is used there are further
+     attributes ‘"capture.start"’, ‘"capture.length"’ and
+     ‘"capture.names"’.
+
+     ‘gregexpr’ returns a list of the same length as ‘text’ each
+     element of which is of the same form as the return value for
+     ‘regexpr’, except that the starting positions of every (disjoint)
+     match are given.
+
+     ‘regexec’ returns a list of the same length as ‘text’ each element
+     of which is either -1 if there is no match, or a sequence of
+     integers with the starting positions of the match and all
+     substrings corresponding to parenthesized subexpressions of
+     ‘pattern’, with attribute ‘"match.length"’ a vector giving the
+     lengths of the matches (or -1 for no match).  The interpretation
+     of positions and length and the attributes follows ‘regexpr’.
+
+     ‘gregexec’ returns the same as ‘regexec’, except that to
+     accommodate multiple matches per element of ‘text’, the integer
+     sequences for each match are made into columns of a matrix, with
+     one matrix per element of ‘text’ with matches.
+
+     Where matching failed because of resource limits (especially for
+     ‘perl = TRUE’) this is regarded as a non-match, usually with a
+     warning.
+
+_W_a_r_n_i_n_g:
+
+     The POSIX 1003.2 mode of ‘gsub’ and ‘gregexpr’ does not work
+     correctly with repeated word-boundaries (e.g., ‘pattern = "\b"’).
+     Use ‘perl = TRUE’ for such matches (but that may not work as
+     expected with non-ASCII inputs, as the meaning of ‘word’ is
+     system-dependent).
+
+_P_e_r_f_o_r_m_a_n_c_e _c_o_n_s_i_d_e_r_a_t_i_o_n_s:
+
+     If you are doing a lot of regular expression matching, including
+     on very long strings, you will want to consider the options used.
+     Generally ‘perl = TRUE’ will be faster than the default regular
+     expression engine, and ‘fixed = TRUE’ faster still (especially
+     when each pattern is matched only a few times).
+
+     If you are working in a single-byte locale and have marked UTF-8
+     strings that are representable in that locale, convert them first
+     as just one UTF-8 string will force all the matching to be done in
+     Unicode, which attracts a penalty of around 3x for the default
+     POSIX 1003.2 mode.
+
+     If you can make use of ‘useBytes = TRUE’, the strings will not be
+     checked before matching, and the actual matching will be faster.
+     Often byte-based matching suffices in a UTF-8 locale since byte
+     patterns of one character never match part of another.  Character
+     ranges may produce unexpected results.
+
+     PCRE-based matching by default used to put additional effort into
+     ‘studying’ the compiled pattern when ‘x’/‘text’ has length 10 or
+     more.  That study may use the PCRE JIT compiler on platforms where
+     it is available (see ‘pcre_config’). As from PCRE2 (PCRE version
+     >= 10.00 as reported by ‘extSoftVersion’), there is no study
+     phase, but the patterns are optimized automatically when possible,
+     and PCRE JIT is used when enabled.  The details are controlled by
+     ‘options’ ‘PCRE_study’ and ‘PCRE_use_JIT’.  (Some timing
+     comparisons can be seen by running file ‘tests/PCRE.R’ in the R
+     sources (and perhaps installed).)  People working with PCRE and
+     very long strings can adjust the maximum size of the JIT stack by
+     setting environment variable ‘R_PCRE_JIT_STACK_MAXSIZE’ before JIT
+     is used to a value between ‘1’ and ‘1000’ in MB: the default is
+     ‘64’. When JIT is not used with PCRE version < 10.30 (that is with
+     PCRE1 and old versions of PCRE2), it might also be wise to set the
+     option ‘PCRE_limit_recursion’.
+
+_N_o_t_e:
+
+     Aspects will be platform-dependent as well as local-dependent: for
+     example the implementation of character classes (except
+     ‘[:digit:]’ and ‘[:xdigit:]’).  One can expect results to be
+     consistent for ASCII inputs and when working in UTF-8 mode (when
+     most platforms will use Unicode character tables, although those
+     are updated frequently and subject to some degree of
+     interpretation - is a circled capital letter alphabetic or a
+     symbol?).  However, results in 8-bit encodings can differ
+     considerably between platforms, modes and from the UTF-8 versions.
+
+_S_o_u_r_c_e:
+
+     The C code for POSIX-style regular expression matching has changed
+     over the years.  As from R 2.10.0 (Oct 2009) the TRE library of
+     Ville Laurikari (<URL: https://github.com/laurikari/tre>) is used.
+     The POSIX standard does give some room for interpretation,
+     especially in the handling of invalid regular expressions and the
+     collation of character ranges, so the results will have changed
+     slightly over the years.
+
+     For Perl-style matching PCRE2 or PCRE (<URL:
+     https://www.pcre.org>) is used: again the results may depend
+     (slightly) on the version of PCRE in use.
+
+_R_e_f_e_r_e_n_c_e_s:
+
+     Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S
+     Language_.  Wadsworth & Brooks/Cole (‘grep’)
+
+_S_e_e _A_l_s_o:
+
+     regular expression (aka ‘regexp’) for the details of the pattern
+     specification.
+
+     ‘regmatches’ for extracting matched substrings based on the
+     results of ‘regexpr’, ‘gregexpr’ and ‘regexec’.
+
+     ‘glob2rx’ to turn wildcard matches into regular expressions.
+
+     ‘agrep’ for approximate matching.
+
+     ‘charmatch’, ‘pmatch’ for partial matching, ‘match’ for matching
+     to whole strings, ‘startsWith’ for matching of initial parts of
+     strings.
+
+     ‘tolower’, ‘toupper’ and ‘chartr’ for character translations.
+
+     ‘apropos’ uses regexps and has more examples.
+
+     ‘grepRaw’ for matching raw vectors.
+
+     Options ‘PCRE_limit_recursion’, ‘PCRE_study’ and ‘PCRE_use_JIT’.
+
+     ‘extSoftVersion’ for the versions of regex and PCRE libraries in
+     use, ‘pcre_config’ for more details for PCRE.
+
+_E_x_a_m_p_l_e_s:
+
+     grep("[a-z]", letters)
+     
+     txt <- c("arm","foot","lefroo", "bafoobar")
+     if(length(i <- grep("foo", txt)))
+        cat("'foo' appears at least once in\n\t", txt, "\n")
+     i # 2 and 4
+     txt[i]
+     
+     ## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
+     gsub("([ab])", "\\1_\\1_", "abc and ABC")
+     
+     txt <- c("The", "licenses", "for", "most", "software", "are",
+       "designed", "to", "take", "away", "your", "freedom",
+       "to", "share", "and", "change", "it.",
+       "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
+       "is", "intended", "to", "guarantee", "your", "freedom", "to",
+       "share", "and", "change", "free", "software", "--",
+       "to", "make", "sure", "the", "software", "is",
+       "free", "for", "all", "its", "users")
+     ( i <- grep("[gu]", txt) ) # indices
+     stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )
+     
+     ## Note that for some implementations character ranges are
+     ## locale-dependent (but not currently).  Then [b-e] in locales such as
+     ## en_US may include B as the collation order is aAbBcCdDe ...
+     (ot <- sub("[b-e]",".", txt))
+     txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution
+     ## In caseless matching, ranges include both cases:
+     a <- grep("[b-e]", txt, value = TRUE)
+     b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE)
+     setdiff(b, a)
+     
+     txt[gsub("g","#", txt) !=
+         gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
+     
+     regexpr("en", txt)
+     
+     gregexpr("e", txt)
+     
+     ## Using grepl() for filtering
+     ## Find functions with argument names matching "warn":
+     findArgs <- function(env, pattern) {
+       nms <- ls(envir = as.environment(env))
+       nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack"
+       aa <- sapply(nms, function(.) { o <- get(.)
+                    if(is.function(o)) names(formals(o)) })
+       iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE)))
+       aa[iw]
+     }
+     findArgs("package:base", "warn")
+     
+     ## trim trailing white space
+     str <- "Now is the time      "
+     sub(" +$", "", str)  ## spaces only
+     ## what is considered 'white space' depends on the locale.
+     sub("[[:space:]]+$", "", str) ## white space, POSIX-style
+     ## what PCRE considered white space changed in version 8.34: see ?regex
+     sub("\\s+$", "", str, perl = TRUE) ## PCRE-style white space
+     
+     ## capitalizing
+     txt <- "a test of capitalizing"
+     gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
+     gsub("\\b(\\w)",    "\\U\\1",       txt, perl=TRUE)
+     
+     txt2 <- "useRs may fly into JFK or laGuardia"
+     gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
+      sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
+     
+     ## named capture
+     notables <- c("  Ben Franklin and Jefferson Davis",
+                   "\tMillard Fillmore")
+     # name groups 'first' and 'last'
+     name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
+     (parsed <- regexpr(name.rex, notables, perl = TRUE))
+     gregexpr(name.rex, notables, perl = TRUE)[[2]]
+     parse.one <- function(res, result) {
+       m <- do.call(rbind, lapply(seq_along(res), function(i) {
+         if(result[i] == -1) return("")
+         st <- attr(result, "capture.start")[i, ]
+         substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1)
+       }))
+       colnames(m) <- attr(result, "capture.names")
+       m
+     }
+     parse.one(notables, parsed)
+     
+     ## Decompose a URL into its components.
+     ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
+     x <- "http://stat.umn.edu:80/xyz"
+     m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
+     m
+     regmatches(x, m)
+     ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
+     ## is the path.  We can use this to make a function for extracting the
+     ## parts of a URL:
+     URL_parts <- function(x) {
+         m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
+         parts <- do.call(rbind,
+                          lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L)))
+         colnames(parts) <- c("protocol","host","port","path")
+         parts
+     }
+     URL_parts(x)
+     
+     ## gregexec() may match multiple times within a single string.
+     pattern <- "([[:alpha:]]+)([[:digit:]]+)"
+     s <- "Test: A1 BC23 DEF456"
+     m <- gregexec(pattern, s)
+     m
+     regmatches(s, m)
+     
+     ## Before gregexec() was implemented, one could emulate it by running
+     ## regexec() on the regmatches obtained via gregexpr().  E.g.:
+     lapply(regmatches(s, gregexpr(pattern, s)),
+            function(e) regmatches(e, regexec(pattern, e)))
+     
+