From f043201930df8bd09f9d46aa90146f95695075ab Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent@modolo.fr>
Date: Tue, 9 Oct 2018 13:24:28 +0200
Subject: [PATCH] intersect_SNP.R: add enzyme selection

---
 src/1_JU28_59vs17_SNP_calling.sh |  3 +-
 src/intersect_SNP.R              | 73 ++++++++++++++++++++++++++++----
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/src/1_JU28_59vs17_SNP_calling.sh b/src/1_JU28_59vs17_SNP_calling.sh
index b9b8056..42f5adb 100644
--- a/src/1_JU28_59vs17_SNP_calling.sh
+++ b/src/1_JU28_59vs17_SNP_calling.sh
@@ -23,6 +23,7 @@ cd tests
 src/intersect_SNP.R \
   results/SNP/vcf_samtools/normal_sample_filtered.csv \
   results/SNP/vcf_samtools/tumor_sample_filtered.csv \
-  results/fasta/DBG2OLC_output2_filtered.fasta
+  results/fasta/DBG2OLC_output2_filtered.fasta \
+  data/list_of_enzymes.csv
 ~/scripts/sms.sh "SNP analysis done"
 
diff --git a/src/intersect_SNP.R b/src/intersect_SNP.R
index 298b878..19999d1 100755
--- a/src/intersect_SNP.R
+++ b/src/intersect_SNP.R
@@ -1,34 +1,56 @@
 #!/usr/bin/Rscript
+rm(list = ls())
 library("tidyverse")
 library("seqinr")
 
+args <- c(
+  "results/SNP/vcf_samtools/normal_sample_filtered.csv",
+  "results/SNP/vcf_samtools/tumor_sample_filtered.csv",
+  "results/fasta/DBG2OLC_output2_filtered.fasta",
+  "data/list_of_enzymes.csv"
+  )
+seq_restric_size <- 21
+
 args <- commandArgs(trailingOnly = TRUE)
 snp_a <- read_delim(args[1], delim = "\t") %>%
   mutate(cords = paste0(CHROM, POS))
 snp_b <- read_delim(args[2], delim = "\t") %>%
   mutate(cords = paste0(CHROM, POS))
 
-
 only_b <- snp_b %>%
   select(cords) %>%
   setdiff(snp_a %>% select(cords)) %>%
   pull(cords)
+
 snp <- snp_b %>%
-  filter(cords %in% only_b)
+  filter(cords %in% only_b) %>%
+  filter(tumor_sample.GT %in% c("A/A", "T/T", "G/G", "C/C")) %>%
+  mutate(REF = do.call(rbind, strsplit(AD, split = ",", fixed = TRUE))[,1],
+         VAR = do.call(rbind, strsplit(AD, split = ",", fixed = TRUE))[,2],
+         REF = as.integer(REF),
+         VAR = as.integer(VAR),
+         tumor_sample.AD = NULL,
+         cords = NULL
+  ) %>%
+  filter(REF == 0) %>%
+  filter(VAR >= 10) %>%
+  arrange(CHROM, desc(VAR))
 
 fastafile <- read.fasta(file = args[3],
                         as.string = TRUE)
 
 snp$seq_list <- snp %>%
-  apply(1, FUN = function(x, fastafile, POS, CHROM){
-      begin <- as.integer(x[ POS ]) - 10
-      end <- as.integer(x[ POS ]) + 10
+  apply(1, FUN = function(x, fastafile, POS, CHROM, seq_restric_size){
+      begin <- as.integer(x[ POS ]) - ((seq_restric_size - 1) / 2)
+      end <- as.integer(x[ POS ]) + ((seq_restric_size - 1) / 2)
       chrom <- x[ CHROM ]
       seq_restric <- fastafile[[ chrom ]] %>%
         c2s() %>%
         substr(begin, end) %>%
         s2c()
-      seq_restric[11] <- toupper(seq_restric[11])
+      seq_restric[((seq_restric_size - 1) / 2) + 1] <-
+        toupper(seq_restric[((seq_restric_size - 1) / 2) + 1])
+      print(paste0(chrom, ":", begin, "-", end, " ", seq_restric %>% c2s()))
       return(seq_restric %>% c2s())
     },
     fastafile = fastafile,
@@ -37,6 +59,41 @@ snp$seq_list <- snp %>%
   )
 
 snp %>%
-  arrange(desc(tumor_sample.AF)) %>%
-  write_csv(paste0("only_", args[2]))
+  write_csv(paste0(args[2], "only.csv" ))
+
+
+snp <- read_csv(paste0(args[2], "only.csv" ))
+enzyme_list <- read_csv(args[4]) %>%
+  mutate(size = nchar(seq))
+
+snp <- snp %>%
+  mutate(enzyme = NA,
+         enzyme_pos = NA,
+         enzyme_seq = NA)
+for (i in seq_len(nrow(enzyme_list))) {
+  enzyme <- enzyme_list[i, ]
+  enzyme_search <- gregexpr(toupper(enzyme$seq),
+                            toupper(snp$seq_list),
+                            fixed = TRUE) %>%
+    lapply(FUN = function(x, enzyme, seq_restric_size) {
+      if (x[1] < 0) {
+        return(c(NA, NA, NA))
+      } else {
+        if (x[1] <= (seq_restric_size - 1) / 2 + 1 &
+            x[1] + enzyme$size >=  (seq_restric_size - 1) / 2 + 1) {
+        return(c(enzyme$enzyme, x[1], enzyme$seq))
+        }
+        return(c(NA, NA, NA))
+      }
+    },
+    enzyme = enzyme,
+    seq_restric_size = seq_restric_size)
+  enzyme_search <- do.call(rbind, enzyme_search)
+  snp[is.na(snp$enzyme), c("enzyme", "enzyme_pos", "enzyme_seq")] <-
+    enzyme_search[is.na(snp$enzyme), ]
+}
+
+snp %>%
+  filter(!is.na(enzyme)) %>%
+  write_csv(paste0(args[2], "only_enzyme.csv" ))
 
-- 
GitLab