Verified Commit 55fd6e44 authored by Laurent Modolo

update pipeline from git@gitbio.ens-lyon.fr:LBMC/Bernard/fact_condensin.git

parent 805010e7
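## Dockerfile for the chip_quant_r container image (tagged 0.0.3 by the build
## script below, an assumption from the commit layout): R 4.0.3 base plus
## tidyverse, readxl and vroom, with the analysis scripts installed in /usr/bin.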
FROM lbmc/r-base:4.0.3
RUN R -e "install.packages(\"devtools\", repos=\"https://cloud.r-project.org\")"
RUN R -e "devtools::install_github(\"tidyverse/readxl\")"
RUN R -e "devtools::install_github(\"tidyverse/tidyverse\")"
RUN R -e "install.packages(\"vroom\", repos=\"https://cloud.r-project.org\")"
## copy files
COPY mapping_stats.R /usr/bin/
COPY normalization.R /usr/bin/
COPY normalization_stats.R /usr/bin/
RUN chmod +x /usr/bin/*.R
#!/bin/sh
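# Build-and-publish helper for lbmc/chip_quant_r:0.0.3: pull any previously
# published tag for layer caching, rebuild the image from the module
# directory, then push the result.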
docker pull lbmc/chip_quant_r:0.0.3
docker build src/.docker_modules/chip_quant_r/0.0.3 -t 'lbmc/chip_quant_r:0.0.3'
docker push lbmc/chip_quant_r:0.0.3
#!/usr/bin/Rscript
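## mapping_stats.R (filename assumed from the Dockerfile COPY list above):
## merge the mapping statistics TSVs (all.tsv, both.tsv, fasta.tsv,
## fasta_calib.tsv), keep absolute counts only, and write a dodged bar plot
## plus a combined stats.tsv.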
library("tidyverse")
load_stats <- function(file_name) {
  tsv_cols_names <- c("value", "test", "variable")
  read_tsv(file_name, col_names = tsv_cols_names) %>%
    select(-c(test)) %>%
    drop_na() %>%
    filter(!(variable %>% str_detect("%"))) %>%
    mutate(
      type = file_name %>% str_replace("(.*)\\.tsv", "\\1"),
      value = value %>% as.numeric(),
      variable = variable %>% as.factor(),
      type = type %>% as.factor()
    )
}
data <- load_stats("all.tsv") %>%
  bind_rows(load_stats("both.tsv")) %>%
  bind_rows(load_stats("fasta.tsv")) %>%
  bind_rows(load_stats("fasta_calib.tsv"))
data %>%
  ggplot(data = .) +
  geom_col(aes(x = value, y = variable, group = type, fill = type),
           position = "dodge") +
  theme_bw()
ggsave("plot.pdf")
data %>% write_tsv("stats.tsv")
#!/usr/bin/Rscript
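## normalization.R (filename assumed from the Dockerfile COPY list): compute a
## spike-in scaling factor (OR) from the mapped-read counts and rescale every
## bedGraph (*.bg) file found in the working directory.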
library("tidyverse")
library("vroom")
# compute OR value
load_stat <- function(file) {
  vroom(
    file,
    col_names = c("value", "name", "type"),
    col_types = list("d", "f", "f")
  ) %>%
    filter(str_detect(type, "fasta.*")) %>%
    filter(str_detect(name, "^mapped$"))
}
or_data <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.tsv")) %>%
  mutate(sample = ifelse(str_detect(file, "^IP_.*"), "ip", "w")) %>%
  mutate(mapping_stats = map(file, load_stat)) %>%
  unnest(c(mapping_stats))
extract_value <- function(data, sample_name, type_name) {
  data %>%
    filter(sample == sample_name & type == type_name) %>%
    pull(value)
}
get_or <- function(or_data, file) {
  or <- (
    (or_data %>% extract_value("w", "fasta_calib")) *
      10e6
  ) / (
    (or_data %>% extract_value("w", "fasta")) *
      (or_data %>% extract_value("ip", "fasta_calib"))
  )
  vroom_write(
    tibble(
      fasta_calib = c(
        or_data %>% extract_value("ip", "fasta_calib"),
        or_data %>% extract_value("w", "fasta_calib")
      ),
      fasta = c(
        or_data %>% extract_value("ip", "fasta"),
        or_data %>% extract_value("w", "fasta")
      ),
      type = c("ip", "w"),
      or = c(or, or)
    ),
    path = str_c(file, "_OR.tsv")
  )
  return(or)
}
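## Worked example with made-up counts (not pipeline data): if the "w" (non-IP)
## sample has 2e5 reads mapped on the calibration genome and 8e6 on the main
## genome, and the IP has 5e4 calibration reads, then
## or = (2e5 * 10e6) / (8e6 * 5e4) = 5, so each bedGraph density is
## multiplied by 5 in norm_bg() below.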
# normalize bg files
norm_bg <- function(file, or) {
  vroom(
    file,
    col_names = c("chr", "start", "stop", "density"),
    col_types = list("f", "i", "i", "d")
  ) %>%
    mutate(density = density * or) %>%
    vroom_write(
      path = str_c("norm_", file),
      col_names = FALSE
    )
  return(0)
}
bg_files <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.bg$")) %>%
  mutate(
    mapping_stats = map(file, function(x, or_data) {
      norm_bg(x, get_or(or_data, x))
    }, or_data = or_data)
  )
#!/usr/bin/Rscript
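## normalization_stats.R (filename assumed from the Dockerfile COPY list):
## subsample 10% of each bedGraph, trim the 2% count tails, and plot per-IP
## count histograms with their medians overlaid.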
library("tidyverse")
library("vroom")
data <- tibble(file = list.files()) %>%
  mutate(
    bg = map(file, function(x) {
      vroom(
        x,
        col_names = c("chr", "start", "stop", "count"),
        col_types = list("f", "i", "i", "d")
      )
    })
  ) %>%
  mutate(file = str_remove(file, "\\.bg")) %>%
  mutate(file = as_factor(file)) %>%
  mutate(bg_s = map(bg, function(x) {sample_frac(x, 0.1)})) %>%
  unnest(bg_s)
data %>%
  filter(count > quantile(count, 0.02)) %>%
  filter(count < quantile(count, 0.98)) %>%
  filter(str_detect(file, "IP")) %>%
  group_by(file) %>%
  mutate(median = median(count)) %>%
  ggplot() +
  geom_histogram(aes(
    x = count,
    fill = file
  ), bins = 50, position = "dodge") +
  geom_vline(aes(xintercept = median)) +
  facet_wrap(~file, scales = "free") +
  theme_bw()
ggsave("plot.pdf")
FROM lbmc/r-base:4.0.3
RUN R -e "install.packages(\"devtools\", repos=\"https://cloud.r-project.org\")"
RUN R -e "devtools::install_github(\"tidyverse/readxl\")"
RUN R -e "devtools::install_github(\"tidyverse/tidyverse\")"
RUN R -e "install.packages(\"vroom\", repos=\"https://cloud.r-project.org\")"
## copy files
COPY mapping_stats.R /usr/bin/
COPY normalization.R /usr/bin/
COPY normalization_mnase.R /usr/bin/
COPY normalization_stats.R /usr/bin/
RUN chmod +x /usr/bin/*.R
#!/bin/sh
docker pull lbmc/chip_quant_r:0.0.4
docker build src/.docker_modules/chip_quant_r/0.0.4 -t 'lbmc/chip_quant_r:0.0.4'
docker push lbmc/chip_quant_r:0.0.4
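# Minimal usage sketch (assumed invocation, not part of the pipeline): the R
# scripts are executable and on the image PATH, so something like
#   docker run --rm -v "$PWD":/data -w /data lbmc/chip_quant_r:0.0.4 mapping_stats.R
# should run the mapping statistics script against the TSVs in the current directory.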
#!/usr/bin/Rscript
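## mapping_stats.R, 0.0.4 copy: identical to the 0.0.3 version above.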
library("tidyverse")
load_stats <- function(file_name) {
  tsv_cols_names <- c("value", "test", "variable")
  read_tsv(file_name, col_names = tsv_cols_names) %>%
    select(-c(test)) %>%
    drop_na() %>%
    filter(!(variable %>% str_detect("%"))) %>%
    mutate(
      type = file_name %>% str_replace("(.*)\\.tsv", "\\1"),
      value = value %>% as.numeric(),
      variable = variable %>% as.factor(),
      type = type %>% as.factor()
    )
}
data <- load_stats("all.tsv") %>%
  bind_rows(load_stats("both.tsv")) %>%
  bind_rows(load_stats("fasta.tsv")) %>%
  bind_rows(load_stats("fasta_calib.tsv"))
data %>%
  ggplot(data = .) +
  geom_col(aes(x = value, y = variable, group = type, fill = type),
           position = "dodge") +
  theme_bw()
ggsave("plot.pdf")
data %>% write_tsv("stats.tsv")
#!/usr/bin/Rscript
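## normalization.R, 0.0.4 copy: identical to the 0.0.3 version above.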
library("tidyverse")
library("vroom")
# compute OR value
load_stat <- function(file) {
  vroom(
    file,
    col_names = c("value", "name", "type"),
    col_types = list("d", "f", "f")
  ) %>%
    filter(str_detect(type, "fasta.*")) %>%
    filter(str_detect(name, "^mapped$"))
}
or_data <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.tsv")) %>%
  mutate(sample = ifelse(str_detect(file, "^IP_.*"), "ip", "w")) %>%
  mutate(mapping_stats = map(file, load_stat)) %>%
  unnest(c(mapping_stats))
extract_value <- function(data, sample_name, type_name) {
  data %>%
    filter(sample == sample_name & type == type_name) %>%
    pull(value)
}
get_or <- function(or_data, file) {
  or <- (
    (or_data %>% extract_value("w", "fasta_calib")) *
      10e6
  ) / (
    (or_data %>% extract_value("w", "fasta")) *
      (or_data %>% extract_value("ip", "fasta_calib"))
  )
  vroom_write(
    tibble(
      fasta_calib = c(
        or_data %>% extract_value("ip", "fasta_calib"),
        or_data %>% extract_value("w", "fasta_calib")
      ),
      fasta = c(
        or_data %>% extract_value("ip", "fasta"),
        or_data %>% extract_value("w", "fasta")
      ),
      type = c("ip", "w"),
      or = c(or, or)
    ),
    path = str_c(file, "_OR.tsv")
  )
  return(or)
}
# normalize bg files
norm_bg <- function(file, or) {
  vroom(
    file,
    col_names = c("chr", "start", "stop", "density"),
    col_types = list("f", "i", "i", "d")
  ) %>%
    mutate(density = density * or) %>%
    vroom_write(
      path = str_c("norm_", file),
      col_names = FALSE
    )
  return(0)
}
bg_files <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.bg$")) %>%
  mutate(
    mapping_stats = map(file, function(x, or_data) {
      norm_bg(x, get_or(or_data, x))
    }, or_data = or_data)
  )
#!/usr/bin/Rscript
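## normalization_mnase.R (the script added in 0.0.4, per the Dockerfile COPY
## list): variant of normalization.R that derives the scaling factor from the
## IP sample alone, using an expected calibration ratio passed as the first
## command-line argument.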
library("tidyverse")
library("vroom")
# compute OR value
load_stat <- function(file) {
  vroom(
    file,
    col_names = c("value", "name", "type"),
    col_types = list("d", "f", "f")
  ) %>%
    filter(str_detect(type, "fasta.*")) %>%
    filter(str_detect(name, "^mapped$"))
}
or_data <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.tsv")) %>%
  mutate(sample = ifelse(str_detect(file, "^IP_.*"), "ip", "w")) %>%
  mutate(mapping_stats = map(file, load_stat)) %>%
  unnest(c(mapping_stats))
extract_value <- function(data, sample_name, type_name) {
  data %>%
    filter(sample == sample_name & type == type_name) %>%
    pull(value)
}
get_or <- function(or_data, file, expected_ratio) {
  ip_x <- or_data %>% extract_value("ip", "fasta")
  ip_c <- or_data %>% extract_value("ip", "fasta_calib")
  correction <- (ip_x * expected_ratio) / (ip_c - expected_ratio * ip_c)
  or <- 10e6 / (ip_c * correction)
  vroom_write(
    tibble(
      fasta_calib = c(
        ip_c,
        or_data %>% extract_value("w", "fasta_calib")
      ),
      fasta = c(
        ip_x,
        or_data %>% extract_value("w", "fasta")
      ),
      expected_ratio_calib = c(expected_ratio, expected_ratio),
      correction = c(correction, correction),
      type = c("ip", "w"),
      or = c(or, or)
    ),
    path = str_c(file, "_OR.tsv")
  )
  return(or)
}
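## Worked example with made-up counts (not pipeline data): for ip_x = 5e6,
## ip_c = 1e5 and expected_ratio = 0.2,
## correction = (5e6 * 0.2) / (1e5 - 0.2 * 1e5) = 12.5 and
## or = 10e6 / (1e5 * 12.5) = 8, so each bedGraph density is multiplied by 8.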
# normalize bg files
norm_bg <- function(file, or) {
  vroom(
    file,
    col_names = c("chr", "start", "stop", "density"),
    col_types = list("f", "i", "i", "d")
  ) %>%
    mutate(density = density * or) %>%
    vroom_write(
      path = str_c("norm_", file),
      col_names = FALSE
    )
  return(0)
}
args <- commandArgs(trailingOnly = TRUE)
expected_ratio <- as.numeric(args[1])
bg_files <- tibble(file = list.files()) %>%
  filter(str_detect(file, ".*\\.bg$")) %>%
  mutate(
    mapping_stats = map(file, function(x, or_data, expected_ratio) {
      norm_bg(x, get_or(or_data, x, expected_ratio))
    }, or_data = or_data, expected_ratio = expected_ratio)
  )
#!/usr/bin/Rscript
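## normalization_stats.R, 0.0.4 copy: identical to the 0.0.3 version above.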
library("tidyverse")
library("vroom")
data <- tibble(file = list.files()) %>%
  mutate(
    bg = map(file, function(x) {
      vroom(
        x,
        col_names = c("chr", "start", "stop", "count"),
        col_types = list("f", "i", "i", "d")
      )
    })
  ) %>%
  mutate(file = str_remove(file, "\\.bg")) %>%
  mutate(file = as_factor(file)) %>%
  mutate(bg_s = map(bg, function(x) {sample_frac(x, 0.1)})) %>%
  unnest(bg_s)
data %>%
  filter(count > quantile(count, 0.02)) %>%
  filter(count < quantile(count, 0.98)) %>%
  filter(str_detect(file, "IP")) %>%
  group_by(file) %>%
  mutate(median = median(count)) %>%
  ggplot() +
  geom_histogram(aes(
    x = count,
    fill = file
  ), bins = 50, position = "dodge") +
  geom_vline(aes(xintercept = median)) +
  facet_wrap(~file, scales = "free") +
  theme_bw()
ggsave("plot.pdf")
@@ -79,6 +79,7 @@ params.csv_sra = ""
params.fasta= false
params.fasta_calib= false
params.macs_gsize = false
params.index_fasta_out = ""
// Show help message
if (params.help) {
@@ -116,6 +117,20 @@ Channel
.value(params.macs_gsize)
.set{ macs_gsize }
Channel
.fromList(evaluate(params.metagene_group))
.set{ metagene_group }
Channel
.fromPath( params.rm_from_genome, checkIfExists: true )
.map { it -> [it.simpleName, it]}
.set{ rm_from_genome }
Channel
.fromPath( params.gff, checkIfExists: true )
.map { it -> [it.simpleName, it]}
.set{ gff_files }
Channel
.fromPath( params.fasta, checkIfExists: true )
.map { it -> [it.simpleName, it]}
@@ -162,6 +177,15 @@ include {
csv_parsing;
} from './csv_parsing.nf'
params.coverage = ""
params.size = ""
include {
sample_fastq
} from './nf_modules/rasusa/main' addParams(
sample_fastq_coverage: params.coverage,
sample_fastq_size: params.size
)
include {
fastp
} from './nf_modules/fastp/main'
@@ -174,14 +198,22 @@ include {
coverage_analysis;
} from './coverage_analysis.nf'
include {
danpos;
} from './danpos.nf' addParams(
metagene_plot_out: "dpeak/",
dpeak_out: "dpeak/"
)
include {
peak_calling;
} from './peak_calling.nf'
include { flagstat_2_multiqc } from './nf_modules/samtools/main' addParams(
multiqc_out: "QC/"
include { flagstat_2_multiqc } from './nf_modules/samtools/main'
include { multiqc } from './nf_modules/multiqc/main' addParams(
multiqc_out: "QC/chipseq/"
)
include { multiqc } from './nf_modules/multiqc/main'
/*
===============================================================================
@@ -194,16 +226,20 @@ workflow {
csv_parsing()
if (params.csv_sra.size() > 0) {
fastq_dump(csv_parsing.out.map{ it -> it.file }.unique())
fastp(get_fastq.out.join(input_csv, by: 1))
sample_fastq(csv_parsing.out.map{ it -> it.file }.unique(), fasta.collect())
fastp(sample_fastq.out.fastq)
} else {
fastp(csv_parsing.out.map{ it -> it.file }.unique())
sample_fastq(csv_parsing.out.map{ it -> it.file }.unique(), fasta.collect())
fastp(sample_fastq.out.fastq)
}
// mapping
mapping(
fasta.collect(),
fasta_calib.collect(),
fastp.out.fastq
fastp.out.fastq,
rm_from_genome.collect(),
csv_parsing.out
)
// coverage analysis
@@ -217,23 +253,30 @@ )
)
/* peak calling */
peak_calling(
mapping.out.bam,
csv_parsing.out,
coverage_analysis.out.bg
// peak_calling(
// mapping.out.bam,
// csv_parsing.out,
// coverage_analysis.out.bg
// )
danpos(
fastp.out.fastq,
mapping.out.fasta,
coverage_analysis.out.bw,
gff_files,
metagene_group
)
/* quality control */
// /* quality control */
flagstat_2_multiqc(coverage_analysis.out.report)
fastp.out.report
.mix(
mapping.out.report,
flagstat_2_multiqc.out.report,
peak_calling.out.report,
coverage_analysis.out.report,
// peak_calling.out.report,
)
.set { multiqc_report }
multiqc(
multiqc_report.collect()
multiqc_report
)
}
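// Diff hunks below target the coverage analysis module (presumably
// coverage_analysis.nf, given the `workflow coverage_analysis` header): the
// chip_quant_r container is bumped from 0.0.2 to 0.0.3 and the bigwig output
// declaration of bedgraph_to_bigwig is updated.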
@@ -28,7 +28,7 @@ ln -s ${stats} ${stats_label}.tsv
}
process mapping_stats {
container = "lbmc/chip_quant_r:0.0.2"
container = "lbmc/chip_quant_r:0.0.3"
label 'big_mem_mono_cpus'
tag "$file_id"
publishDir "results/mapping/stats/", mode: 'copy'
@@ -54,7 +54,7 @@ mv stats.tsv ${file_prefix}.tsv
}
process coverage_normalization {
container = "lbmc/chip_quant_r:0.0.2"
container = "lbmc/chip_quant_r:0.0.3"
label 'big_mem_mono_cpus'
tag "${file_id}"
publishDir "results/coverage/stats/", mode: 'copy', pattern: '*.tsv'
@@ -89,7 +89,7 @@ process bedgraph_to_bigwig {
tuple val(bed_calib_id), path(bed_fasta_calib)
output:
tuple val(file_id), path("*.bw"), emit: bw
tuple val(file_id), path("${file_id.WCE}_norm.bw"), path("${file_id.IP}_norm.bw"), emit: bw
script:
"""
@@ -126,7 +126,7 @@ rm sort_${bg_ip_norm}
}
process normalization_stats {
container = "lbmc/chip_quant_r:0.0.2"
container = "lbmc/chip_quant_r:0.0.3"
label 'big_mem_mono_cpus'
tag "$file_id"
publishDir "results/coverage/stats/", mode: 'copy'
@@ -182,27 +182,22 @@ workflow coverage_analysis {