From 12b1d9fb428749071270eb3e744523633813fa3c Mon Sep 17 00:00:00 2001 From: Emmanuel Labaronne <emmanuel.labaronne@ens-lyon.fr> Date: Thu, 30 Apr 2020 18:50:02 +0200 Subject: [PATCH] src/FLOSS_score : clean folder to keep only useful files --- src/FLOSS_score/Makefile-align-mesc | 108 ------- src/FLOSS_score/Makefile-align-pull | 91 ------ src/FLOSS_score/analysis.sh | 6 +- src/FLOSS_score/count-calc-hek.R | 50 ---- src/FLOSS_score/count-mapped.R | 14 - src/FLOSS_score/count-plot-hek.R | 265 ----------------- src/FLOSS_score/count-score-hek.R | 92 ------ src/FLOSS_score/floss-calc-mesc.R | 19 -- src/FLOSS_score/floss-plots-mesc_IP.R | 144 --------- ...ss-plots-mesc.R => floss-plots-template.R} | 71 ++--- src/FLOSS_score/human-annotation.R | 1 + src/FLOSS_score/ldist-calc-mesc.R | 76 ----- src/FLOSS_score/ldist-calc-template.R | 47 +++ src/FLOSS_score/ldist-plots-mesc.R | 281 ------------------ src/FLOSS_score/ldist-plots-mesc_ip.R | 281 ------------------ src/FLOSS_score/ldist-plots-template.R | 60 ++++ src/FLOSS_score/mouse-annotation.R | 2 +- src/FLOSS_score/orf-org-plot.R | 117 -------- src/FLOSS_score/orf-organization.R | 205 ------------- src/FLOSS_score/scspike.sh | 31 -- 20 files changed, 132 insertions(+), 1829 deletions(-) delete mode 100644 src/FLOSS_score/Makefile-align-mesc delete mode 100644 src/FLOSS_score/Makefile-align-pull delete mode 100644 src/FLOSS_score/count-calc-hek.R delete mode 100644 src/FLOSS_score/count-mapped.R delete mode 100644 src/FLOSS_score/count-plot-hek.R delete mode 100644 src/FLOSS_score/count-score-hek.R delete mode 100644 src/FLOSS_score/floss-calc-mesc.R delete mode 100644 src/FLOSS_score/floss-plots-mesc_IP.R rename src/FLOSS_score/{floss-plots-mesc.R => floss-plots-template.R} (55%) delete mode 100644 src/FLOSS_score/ldist-calc-mesc.R create mode 100644 src/FLOSS_score/ldist-calc-template.R delete mode 100644 src/FLOSS_score/ldist-plots-mesc.R delete mode 100644 src/FLOSS_score/ldist-plots-mesc_ip.R create mode 100644 src/FLOSS_score/ldist-plots-template.R delete mode 100644 src/FLOSS_score/orf-org-plot.R delete mode 100644 src/FLOSS_score/orf-organization.R delete mode 100644 src/FLOSS_score/scspike.sh diff --git a/src/FLOSS_score/Makefile-align-mesc b/src/FLOSS_score/Makefile-align-mesc deleted file mode 100644 index 9e884e27..00000000 --- a/src/FLOSS_score/Makefile-align-mesc +++ /dev/null @@ -1,108 +0,0 @@ -SHELL=/bin/bash -x - -all: - -.SECONDARY: - -include samples.d - -NEW_SAMPLES=ribo_chx_mesc ribo_emet_mesc -OLD_SAMPLES=ribo_chx_oldmesc ribo_emet_oldmesc ribo_harr120_oldmesc ribo_harr150_oldmesc -RIBO_SAMPLES=$(NEW_SAMPLES) $(OLD_SAMPLES) -SAMPLES=$(RIBO_SAMPLES) -PROJECTS=/mnt/ingolialab/FastQ/121204_SN375_0200_AC1A7LACXX/FastQ/Project_NI-nickingolia-ESdrug/ \ - /mnt/ingolialab/FastQ/130305_SN375_0214_AC1CACACXX/FastQ/Project_NI-nickingolia/ \ - /mnt/ingolialab/FastQ/130709_SN375_0236_BD294UACXX/FastQ-i4n3/Project_NI-nickingolia/ - -ribo_emet_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100426/s_3_sequence.txt.bz2 -ribo_chx_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100426/s_5_sequence.txt.bz2 -ribo_chx_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100809/s_4_sequence.txt.bz2 -ribo_harr120_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100706/s_5_sequence.txt.bz2 -ribo_harr120_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100706/s_6_sequence.txt.bz2 -ribo_harr150_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100624/s_2_sequence.txt.bz2 -ribo_harr150_oldmesc_trim.fq: /mnt/ingolialab/ingolia/MouseES/RawData/100706/s_2_sequence.txt.bz2 - -# RUNTIME CONFIGURATION -BOWTIE_PARALLEL=-p 32 -TOPHAT_KEEP_TEMP=#--keep-tmp - -# LINKER TRIMMING CONFIGURATION -NI_NI_9=CTGTAGGCACCATCAATAGA - -FP_LINKER=$(NI_NI_9) -FP_MIN_INSERT=24 - -# RRNA ALIGNMENT CONFIGURATION -RRNA_MAQ_ERR=60 -RRNA_SEEDLEN=23 -RRNA_MAXREAD=51 - -RRNA_EBWT=/mnt/ingolialab/ingolia/Genomes/SpikeIn/mm-scspike-rrna - -# GENOME ALIGNMENT CONFIGURATION -TOPHAT_EBWT=/mnt/ingolialab/ingolia/Genomes/SpikeIn/mm10-scspike -TOPHAT_GTF=Mm.GRCm38.72-plus-scspike.gtf - -# BINARIES -TOPHAT=tophat -BOWTIE=bowtie -SAMTOOLS=samtools -TAM_TO_BAM=$(SAMTOOLS) view -b -S -h - - -# Fix up chromosome names for Ensembl -Mm.GRCm38.72.gtf: Mus_musculus.GRCm38.72.gtf - awk -F$$'\t' '/^[0-9XY]/ { printf("chr%s\n", $$0) } /^MT/ { gsub(/^MT/, "chrM", $$0); print }' $< > $@ - -$(TOPHAT_GTF): Mm.GRCm38.72.gtf /mnt/ingolialab/ingolia/Genomes/SpikeIn/sac_cer_spike.gtf - cat $^ > $@ - -samples.d: $(PROJECTS) - ls $(addsuffix /Sample_*/*.fastq.gz,$(PROJECTS)) | \ - sed 's,.*Sample_\(.*\)/.*\.fastq\.gz$$,\1_trim.fq: &,' | \ - sed s/ribo_mesc_chx/ribo_chx_mesc/ | \ - sed s/ribo_mesc_emet/ribo_emet_mesc/ \ - > $@ - -trims: $(addsuffix _trim.fq, $(SAMPLES)) -norrnas: $(addsuffix _norrna.fq, $(SAMPLES)) -genomic: $(addsuffix _genome_sorted.bam.bai, $(SAMPLES)) - -all: trims -all: norrnas -all: genomic - -# RIBO LINKER TRIMMING -$(addsuffix _trim.fq, $(NEW_SAMPLES)): %_trim.fq: - mkdir -p Statistics - zcat $^ \ - | fastx_clipper -Q33 -a $(FP_LINKER) -l $(FP_MIN_INSERT) -c -n -v 2>Statistics/$*_clip.txt \ - > $@ - -$(addsuffix _trim.fq, $(OLD_SAMPLES)): %_trim.fq: - mkdir -p Statistics - bzcat $^ \ - | fastx_clipper -Q33 -a $(FP_LINKER) -l $(FP_MIN_INSERT) -c -n -v 2>Statistics/$*_clip.txt \ - > $@ - -# RRNA ALIGNMENTS -RRNA_BOWTIE_ARGS=$(BOWTIE_PARALLEL) --solexa-quals --maqerr=$(RRNA_MAQ_ERR) --seedlen=$(RRNA_SEEDLEN) - -$(addsuffix _norrna.fq, $(RIBO_SAMPLES)):%_norrna.fq: %_trim.fq - mkdir -p Statistics - bowtie $(RRNA_BOWTIE_ARGS) --un $@ --sam $(RRNA_EBWT) $< 2>Statistics/$*_rrna.txt > /dev/null - -# GENOME ALIGNMENTS -TOPHAT_ARGS=$(BOWTIE_PARALLEL) --solexa-quals --GTF $(TOPHAT_GTF) --no-novel-juncs --library-type fr-unstranded $(TOPHAT_KEEP_TEMP) - -$(addsuffix _genome/accepted_hits.bam, $(SAMPLES)): %_genome/accepted_hits.bam: %_norrna.fq $(TOPHAT_EBWT).1.ebwt $(TOPHAT_GTF) - mkdir -p Statistics - $(TOPHAT) $(TOPHAT_ARGS) --output-dir $(dir $@) $(TOPHAT_EBWT) $< 2>Statistics/$*_genome.txt - -%_sorted.bam.bai: %_sorted.bam - $(SAMTOOLS) index $< 2>Statistics/$*_genome_index.txt - -$(addsuffix _genome_sorted.bam, $(SAMPLES)): %_genome_sorted.bam: %_genome/accepted_hits.bam - samtools view -h $^ | \ - grep -E '(NM:i:[01])|(^@)' | \ - samtools view -S -b - > $@ - diff --git a/src/FLOSS_score/Makefile-align-pull b/src/FLOSS_score/Makefile-align-pull deleted file mode 100644 index 37559b1b..00000000 --- a/src/FLOSS_score/Makefile-align-pull +++ /dev/null @@ -1,91 +0,0 @@ -SHELL=/bin/bash -x - -all: - -.SECONDARY: - -include samples.d - -RIBO_SAMPLES=input bound -SAMPLES=$(RIBO_SAMPLES) -PROJECTS=/mnt/ingolialab/FastQ/130628_SN375_0234_AD24HWACXX/FastQ/Project_NI-mikeharris/ \ - /mnt/ingolialab/FastQ/130709_SN375_0236_BD294UACXX/FastQ-i4n3/Project_NI-mikeharris/ - -# RUNTIME CONFIGURATION -BOWTIE_PARALLEL=-p 32 -TOPHAT_KEEP_TEMP=#--keep-tmp - -# LINKER TRIMMING CONFIGURATION -NI_NI_9=CTGTAGGCACCATCAATAGA - -FP_LINKER=$(NI_NI_9) -FP_MIN_INSERT=24 - -# RRNA ALIGNMENT CONFIGURATION -RRNA_MAQ_ERR=60 -RRNA_SEEDLEN=23 -RRNA_MAXREAD=51 - -RRNA_EBWT=/mnt/ingolialab/ingolia/Genomes/SpikeIn/hs-scspike-rrna - -# GENOME ALIGNMENT CONFIGURATION -TOPHAT_EBWT=/mnt/ingolialab/ingolia/Genomes/SpikeIn/hg19-scspike -TOPHAT_GTF=gencode.v17-plus-scspike.gtf - -# BINARIES -TOPHAT=tophat --bowtie1 -BOWTIE=bowtie - -SAMTOOLS=samtools -TAM_TO_BAM=$(SAMTOOLS) view -b -S -h - - -gencode.v17.annotation.gtf.gz: - curl -O 'ftp://ftp.sanger.ac.uk/pub/gencode/release_17/gencode.v17.annotation.gtf.gz' - -gencode.v17.annotation.gtf: gencode.v17.annotation.gtf.gz - gunzip $< - -$(TOPHAT_GTF): gencode.v17.annotation.gtf /mnt/ingolialab/ingolia/Genomes/SpikeIn/sac_cer_spike.gtf - cat $^ > $@ - -samples.d: $(PROJECTS) - ls $(addsuffix /Sample_*/*.fastq.gz,$(PROJECTS)) | \ - sed 's,.*Sample_\(.*\)/.*\.fastq\.gz$$,\1_trim.fq: &,' \ - > $@ - -trims: $(addsuffix _trim.fq, $(SAMPLES)) -norrnas: $(addsuffix _norrna.fq, $(SAMPLES)) -genomic: $(addsuffix _genome_sorted.bam.bai, $(SAMPLES)) - -all: trims -all: norrnas -all: genomic - -# RIBO LINKER TRIMMING -$(addsuffix _trim.fq, $(RIBO_SAMPLES)): %_trim.fq: - mkdir -p Statistics - zcat $^ \ - | fastx_clipper -Q33 -a $(FP_LINKER) -l $(FP_MIN_INSERT) -c -n -v 2>Statistics/$*_clip.txt \ - > $@ - -# RRNA ALIGNMENTS -RRNA_BOWTIE_ARGS=$(BOWTIE_PARALLEL) --solexa-quals --maqerr=$(RRNA_MAQ_ERR) --seedlen=$(RRNA_SEEDLEN) - -$(addsuffix _norrna.fq, $(RIBO_SAMPLES)):%_norrna.fq: %_trim.fq - mkdir -p Statistics - bowtie $(RRNA_BOWTIE_ARGS) --un $@ --sam $(RRNA_EBWT) $< 2>Statistics/$*_rrna.txt > /dev/null - -# GENOME ALIGNMENTS -TOPHAT_ARGS=$(BOWTIE_PARALLEL) --solexa-quals --GTF $(TOPHAT_GTF) --no-novel-juncs --library-type fr-unstranded $(TOPHAT_KEEP_TEMP) - -$(addsuffix _genome/accepted_hits.bam, $(SAMPLES)): %_genome/accepted_hits.bam: %_norrna.fq $(TOPHAT_EBWT).1.ebwt $(TOPHAT_GTF) - mkdir -p Statistics - $(TOPHAT) $(TOPHAT_ARGS) --output-dir $(dir $@) $(TOPHAT_EBWT) $< 2>Statistics/$*_genome.txt - -%_sorted.bam.bai: %_sorted.bam - $(SAMTOOLS) index $< 2>Statistics/$*_genome_index.txt - -$(addsuffix _genome_sorted.bam, $(SAMPLES)): %_genome_sorted.bam: %_genome/accepted_hits.bam - samtools view -h $^ | \ - grep -E '(NM:i:[01])|(^@)' | \ - samtools view -S -b - > $@ diff --git a/src/FLOSS_score/analysis.sh b/src/FLOSS_score/analysis.sh index 240ffaf5..2da4cecc 100644 --- a/src/FLOSS_score/analysis.sh +++ b/src/FLOSS_score/analysis.sh @@ -41,8 +41,8 @@ #rm tmp.sam /Xnfs/lbmcdb/Ricci_team/HIV_project/results/20180326_RP_U937CHX/Results/10_FLOSS_score/U937_CHX_all_hg38_NY5.bam ### Calculating FLOSS scores -R --no-save < ldist-calc-mesc.R +R --no-save < ldist-calc-template.R # FLOSS scores needed to pick lincs in ldist plots #R --no-save < floss-calc-mesc.R ####### ---> this script is now merged in the first one -R --no-save < ldist-plots-mesc.R -R --no-save < floss-plots-mesc.R +R --no-save < ldist-plots-template.R +R --no-save < floss-plots-template.R diff --git a/src/FLOSS_score/count-calc-hek.R b/src/FLOSS_score/count-calc-hek.R deleted file mode 100644 index 39a34a02..00000000 --- a/src/FLOSS_score/count-calc-hek.R +++ /dev/null @@ -1,50 +0,0 @@ -library("rtracklayer") -library("Rsamtools") -library("biomaRt") - -source("../shared/transcript-utils.R") -source("../shared/footprint-assignment.R") -source("../shared/floss.R") - -options(mc.cores=32) - -hekBedFile <- "/mnt/ingolialab/mharr/schsgenome/gencode.v17.annotation.bed" -yeastBedFile <- "/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/sac_cer_spike.bed" - -boundBamFile <- BamFile("/mnt/ingolialab/mharr/130709_YeastMix/RawData/bound.bam") -inputBamFile <- BamFile("/mnt/ingolialab/mharr/130709_YeastMix/RawData/input.bam") - -aoffsets <- data.frame(asite=c(14,14,14,14,15,15), row.names=seq(26,31)) - -yeastBed <- import(yeastBedFile, format="BED", asRangedData = FALSE) -hekBed <- import(hekBedFile, format="BED", asRangedData = FALSE) - -yeastSize <- regionCountFrame(countSizes(defaultInsets, yeastBed)) -yeastInput <- regionCountFrame(countAligns(aoffsets, defaultInsets, inputBamFile, yeastBed)) -yeastBound <- regionCountFrame(countAligns(aoffsets, defaultInsets, boundBamFile, yeastBed)) - -write.csv(yeastSize, "data/yeast-pulldown-sizes.txt", row.names=TRUE) -write.csv(yeastInput, "data/yeast-pulldown-input.txt", row.names=TRUE) -write.csv(yeastBound, "data/yeast-pulldown-bound.txt", row.names=TRUE) - -yeastInputLDists <- calcTrxRegionLengthDists(inputBamFile, yeastBed) -writeRegionLengthDists(yeastInputLDists, "data/yeast-pulldown-input") - -yeastBoundLDists <- calcTrxRegionLengthDists(boundBamFile, yeastBed) -writeRegionLengthDists(yeastBoundLDists, "data/yeast-pulldown-bound") - -hekSize <- regionCountFrame(countSizes(defaultInsets, hekBed)) -hekInput <- regionCountFrame(countAligns(aoffsets, defaultInsets, inputBamFile, hekBed)) -hekBound <- regionCountFrame(countAligns(aoffsets, defaultInsets, boundBamFile, hekBed)) - -write.csv(hekSize, "data/hek-pulldown-sizes.txt", row.names=TRUE) -write.csv(hekInput, "data/hek-pulldown-input.txt", row.names=TRUE) -write.csv(hekBound, "data/hek-pulldown-bound.txt", row.names=TRUE) - -hekInputLDists <- calcTrxRegionLengthDists(inputBamFile, hekBed) -writeRegionLengthDists(hekInputLDists, "data/hek-pulldown-input") - -hekBoundLDists <- calcTrxRegionLengthDists(boundBamFile, hekBed) -writeRegionLengthDists(hekBoundLDists, "data/hek-pulldown-bound") - - diff --git a/src/FLOSS_score/count-mapped.R b/src/FLOSS_score/count-mapped.R deleted file mode 100644 index 39e80caf..00000000 --- a/src/FLOSS_score/count-mapped.R +++ /dev/null @@ -1,14 +0,0 @@ -options(mc.cores=16) -bamFiles <- c("/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_chx_mesc_genome_sorted.bam", - "/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_emet_mesc_genome_sorted.bam", - "/mnt/ingolialab/mharr/130709_YeastMix/RawData/bound.bam", - "/mnt/ingolialab/mharr/130709_YeastMix/RawData/input.bam", - "/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_harr120_oldmesc_genome_sorted.bam", - "/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_harr150_oldmesc_genome_sorted.bam") - -counts <- mclapply(bamFiles, function(bamFile) { - res <- system(sprintf("samtools flagstat %s", bamFile), intern=TRUE) - c(bamFile, res) -}) - -cat(do.call(c, counts), file="data/mapped-count.txt", sep="\n") diff --git a/src/FLOSS_score/count-plot-hek.R b/src/FLOSS_score/count-plot-hek.R deleted file mode 100644 index 1792526e..00000000 --- a/src/FLOSS_score/count-plot-hek.R +++ /dev/null @@ -1,265 +0,0 @@ -library("biomaRt") -library("GenomicRanges") - -source("../shared/transcript-utils.R") -source("../shared/footprint-assignment.R") -source("../shared/floss.R") -source("../shared/plot-utils.R") - -source("human-annotation.R") - -lengthLevels <- seq(defaultLengthMin, defaultLengthMax) - -yeastCount <- read.csv("data/yeast-pulldown-count.txt", row.names=1) -yeastFloss <- read.csv("data/yeast-pulldown-bound-floss.txt", check.names=FALSE, row.names=1) -yeastCount <- cbind(yeastCount, yeastFloss[match(row.names(yeastCount), row.names(yeastFloss)),]) - -hekCount <- read.csv("data/hek-pulldown-count.txt", row.names=1) -hekFloss <- read.csv("data/hek-pulldown-input-floss.txt", check.names=FALSE, row.names=1) -hekCount <- cbind(hekCount, hekFloss[match(row.names(hekCount), row.names(hekFloss)),]) - -xref <- match(row.names(hekCount), annots$ensembl_transcript_id) -table(is.na(xref)) -hekCount <- cbind(hekCount, annots[xref,]) - -hekexpr <- hekCount[hekCount$inputCdsCount > 100 & !is.na(hekCount$inputCdsCount),] -hekmed <- median(log2(hekexpr$boundCdsCount / hekexpr$inputCdsCount), na.rm=T) - -yeastexpr <- yeastCount[yeastCount$inputTrxCount > 100 & !is.na(yeastCount$inputTrxCount),] -yeastmed <- median(log2(yeastexpr$boundTrxCount / yeastexpr$inputTrxCount), na.rm=T) - -pcoding <- hekCount[row.names(hekCount) %in% refCodingIds,] -lincs <- hekCount[row.names(hekCount) %in% cleanLincIds,] -mitoch <- hekCount[row.names(hekCount) %in% mitoCodingIds,] -miscs <- hekCount[row.names(hekCount) %in% miscIds,] -snornas <- hekCount[hekCount$transcript_biotype == "snoRNA" & !is.na(hekCount$transcript_biotype),] - -pcodingPresent <- pcoding[pcoding$cdsNRead >= 10,] -utr5Present <- pcoding[pcoding$utr5NRead >= 10,] -lincsPresent <- lincs[lincs$trxNRead >= 10,] - -cutoff <- flossCutoffs(pcoding$cdsNRead, pcoding$cdsScore) -lincsPresent$trxClassify <- cutoff$classify(lincsPresent$trxNRead, lincsPresent$trxScore) - -plotBase <- function() { - plot(log10(pcoding$cdsNRead), pcoding$cdsScore, xlim=c(1,log10(2e5)), ylim=c(0,1), - pch=".", col=solarGrey01, - axes=F, xlab="Total Reads", ylab="Fragment Length Distribution Score", cex.lab=1.33) - axis(1, log10(10**seq(1,5,1)), labels=c("10", "100", "1k", "10k", "100k"), cex.axis=1) - axis(1, log10(c(sapply(10**seq(1,5,1), minor_decade))), labels=F, tcl=-0.5) - axis(2, seq(0,1,0.2)) - - lines(seq(1,5,0.1), predict(cutoff$extremeLoess, seq(1,5,0.1))) -} - -pdf("hek-input-ldist-rplot.pdf", useDingbats=F) - -plotBase() -points(log10(mitoch$cdsNRead), mitoch$cdsScore, - pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) -points(log10(miscs$trxNRead), miscs$trxScore, - pch=21, cex=0.75, col=solarYellow, bg=lighten(solarYellow)) - -plotBase() -title(main="5' leaders") -points(log10(utr5Present$utr5NRead), utr5Present$utr5Score, - pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) - -plotBase() -title(main="lincRNAs") -points(log10(lincsPresent[lincsPresent$trxClassify != "Extreme","trxNRead"]), - lincsPresent[lincsPresent$trxClassify != "Extreme", "trxScore"], - pch=21, cex=0.75, col=solarMagenta, bg=lighten(solarMagenta)) -points(log10(lincsPresent[lincsPresent$trxClassify == "Extreme","trxNRead"]), - lincsPresent[lincsPresent$trxClassify == "Extreme", "trxScore"], - pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - -plotBase() -title(main="snoRNAs") -points(log10(snornas$trxNRead), snornas$trxScore, - pch=21, cex=0.75, col=solarGreen, bg=lighten(solarGreen)) - -dev.off() - -write.csv(pcoding[,c("cdsNRead", "cdsScore")], "tables/hek-input-floss-refcoding-cds.txt") -write.csv(mitoch[,c("cdsNRead", "cdsScore")], "tables/hek-input-floss-mitoch-cds.txt") -write.csv(miscs[,c("trxNRead", "trxScore")], "tables/hek-input-floss-miscs.txt") -write.csv(pcoding[,c("utr5NRead", "utr5Score")], "tables/hek-input-floss-refcoding-utr5.txt") -write.csv(lincsPresent[,c("trxNRead", "trxScore", "trxClassify")], "tables/hek-input-floss-lincs.txt") -write.csv(snornas[,c("trxNRead", "trxScore")], "tables/hek-input-floss-snornas.txt") - -pdf("yeast-bound-ldist-rplot.pdf", useDingbats=F) - -yeastInputRefLDist <- normLDist(read.csv("data/yeast-pulldown-input-ref-ldist.txt", check.names=F, row.names=1)[,1]) -yeastBoundRefLDist <- normLDist(read.csv("data/yeast-pulldown-bound-ref-ldist.txt", check.names=F, row.names=1)[,1]) -hekInputRefLDist <- normLDist(read.csv("data/hek-pulldown-input-ref-ldist.txt", check.names=F, row.names=1)[,1]) -hekBoundRefLDist <- normLDist(read.csv("data/hek-pulldown-bound-ref-ldist.txt", check.names=F, row.names=1)[,1]) - -#plot(lengthLevels, yeastInputRefLDist, type="l", col=solarGreen, lwd=2, -# xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length") -#lines(lengthLevels, yeastBoundRefLDist, type="l", col=solarBlue, lwd=2) -#lines(lengthLevels, hekInputRefLDist, type="l", col=solarBlack, lwd=2) -#lines(lengthLevels, hekBoundRefLDist, type="l", col=solarYellow, lwd=2) - -plot(lengthLevels, cumsum(yeastInputRefLDist), type="l", col=solarGreen, lwd=2, - xlim=c(26,34), ylim=c(0,1), xlab="Fragment Length") -lines(lengthLevels, cumsum(yeastBoundRefLDist), type="l", col=solarBlue, lwd=2) -lines(lengthLevels, cumsum(hekInputRefLDist), type="l", col=solarBlack, lwd=2) -lines(lengthLevels, cumsum(hekBoundRefLDist), type="l", col=solarYellow, lwd=2) -legend(x = "bottomright", legend=c("Yeast Input", "Yeast Bound", "HEK Input", "HEK Bound"), - col=c(solarGreen, solarBlue, solarBlack, solarYellow), - lwd=2) - -plot(log10(yeastCount$vsyeastNRead), yeastCount$vsyeastScore, xlim=c(1,log10(2e5)), ylim=c(0,1), - pch=".", col=solarBlue, - axes=F, xlab="Total Reads", ylab="FLOSS vs Yeast Input", cex.lab=1.33) -axis(1, log10(10**seq(1,5,1)), labels=c("10", "100", "1k", "10k", "100k"), cex.axis=1) -axis(1, log10(c(sapply(10**seq(1,5,1), minor_decade))), labels=F, tcl=-0.5) -axis(2, seq(0,1,0.2)) -title(main="Yeast mRNAs in Bound") - -plot(log10(yeastCount$vshekNRead), yeastCount$vshekScore, xlim=c(1,log10(2e5)), ylim=c(0,1), - pch=".", col=solarMagenta, - axes=F, xlab="Total Reads", ylab="FLOSS vs HEK Input", cex.lab=1.33) -axis(1, log10(10**seq(1,5,1)), labels=c("10", "100", "1k", "10k", "100k"), cex.axis=1) -axis(1, log10(c(sapply(10**seq(1,5,1), minor_decade))), labels=F, tcl=-0.5) -axis(2, seq(0,1,0.2)) -title(main="Yeast mRNAs in Bound") - -plot(log10(yeastCount$vsyeastNRead), yeastCount$vsyeastScore, xlim=c(1,log10(2e5)), ylim=c(0,1), - pch=".", col=solarBlue, - axes=F, xlab="Total Reads", ylab="FLOSS", cex.lab=1.33) -points(log10(yeastCount$vshekNRead), yeastCount$vshekScore, pch=".", col=solarMagenta) -axis(1, log10(10**seq(1,5,1)), labels=c("10", "100", "1k", "10k", "100k"), cex.axis=1) -axis(1, log10(c(sapply(10**seq(1,5,1), minor_decade))), labels=F, tcl=-0.5) -axis(2, seq(0,1,0.2)) -legend(x = "topright", legend=c("vs Yeast Input", "vs HEK Input"), - col=c(solarBlue, solarMagenta), pt.bg=c(solarBlue, solarMagenta), pch=21) -title(main="Yeast mRNAs in Bound") - -dev.off() - -plotBase <- function() { - plot(log10(pcodingPresent$inputCdsCount), log2(pcodingPresent$boundCdsCount / pcodingPresent$inputCdsCount), xlim=c(1,log10(2e5)), ylim=c(-6,6), - pch=".", col=solarGrey01, - axes=F, xlab="Input Reads", ylab="Bound / Input", cex.lab=1.33) - axis(1, log10(10**seq(1,5,1)), labels=c("10", "100", "1k", "10k", "100k"), cex.axis=1) - axis(1, log10(c(sapply(10**seq(1,5,1), minor_decade))), labels=F, tcl=-0.5) - axis(2, seq(-6,6,3), labels=c("1/64", "1/8", "1", "8", "64"), cex.axis=1) - axis(2, seq(-6,6), labels=F, tcl=-0.5) -} - -pdf("hek-pulldown-enrich-rplot.pdf", useDingbats=F) - -plotBase() -abline(h=hekmed, col=lighten(solarGrey01), lwd=2) -points(log10(yeastCount$inputTrxCount), log2(yeastCount$boundTrxCount / yeastCount$inputTrxCount), - pch=".", col=solarBlue) -abline(h=yeastmed, col=lighten(solarBlue), lwd=2) -acc1=yeastCount["YNR016C",] -points(log10(acc1$inputTrxCount), log2(acc1$boundTrxCount / acc1$inputTrxCount), - pch=21, cex=0.75, col=solarBlue, bg=lighten(solarBlue)) -text(log10(acc1$inputTrxCount), log2(acc1$boundTrxCount / acc1$inputTrxCount), - labels="ACC1", col=solarBlue, pos=4) -text(4, (yeastmed + hekmed), labels=sprintf("%0.1fx\n", 2**(hekmed - yeastmed)), col=solarBlue, cex=1.5, pos=4) -text(4,5,labels="HEK CDSes", col=solarGrey01, cex=1.5) -text(4,-5,labels="Yeast CDSes", col=solarBlue, cex=1.5) - -plotBase() -points(log10(mitoch$inputCdsCount), log2(mitoch$boundCdsCount / mitoch$inputCdsCount), - pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) -points(log10(miscs$inputTrxCount), log2(miscs$boundTrxCount / miscs$inputTrxCount), - pch=21, cex=0.75, col=solarYellow, bg=lighten(solarYellow)) -text(log10(miscs[miscs$inputTrxCount>100,]$inputTrxCount), - log2(miscs[miscs$inputTrxCount>100,]$boundTrxCount / miscs[miscs$inputTrxCount>100,]$inputTrxCount), - labels=miscs[miscs$inputTrxCount>100,]$external_gene_id, - col=solarYellow, pos=4, cex=0.75) -text(4,4,labels="ncRNAs", col=solarYellow, cex=1.5) -text(4,3,labels="mito CDSes", col=solarCyan, cex=1.5) - -plotBase() -points(log10(snornas$inputTrxCount), log2(snornas$boundTrxCount / snornas$inputTrxCount), - pch=21, cex=0.50, col=solarGreen, bg=lighten(solarGreen)) -text(4,5,labels="snoRNAs", col=solarGreen, cex=1.5) - -plotBase() -title(main="5' Leaders") -points(log10(utr5Present$inputUtr5Count), log2(utr5Present$boundUtr5Count / utr5Present$inputUtr5Count), - pch=21, cex=0.33, col=solarRed, bg=lighten(solarRed)) -text(4,5,labels="5' leaders", col=solarRed, cex=1.5) - -plotBase() -lincsGood <- lincsPresent[lincsPresent$trxClassify != "Extreme",] -points(log10(lincsGood$inputTrxCount), log2(lincsGood$boundTrxCount / lincsGood$inputTrxCount), - pch=21, cex=0.75, col=solarMagenta, bg=lighten(solarMagenta)) -lincsExtreme <- lincsPresent[lincsPresent$trxClassify == "Extreme",] -points(log10(lincsExtreme$inputTrxCount), log2(lincsExtreme$boundTrxCount / lincsExtreme$inputTrxCount), - pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) -lincsLabel <- lincsExtreme[lincsExtreme$inputTrxCount > 100,] -text(log10(lincsLabel$inputTrxCount), log2(lincsLabel$boundTrxCount / lincsLabel$inputTrxCount), - labels=lincsLabel$external_gene_id, col=solarCyan, pos=4, cex=0.75) -text(4,5,labels="lincRNAs", col=solarMagenta, cex=1.5) - -dev.off() - -write.csv(pcodingPresent[,c("inputCdsCount", "boundCdsCount", "cdsScore")], - "tables/hek-pulldown-refcoding-cds.txt") -write.csv(yeastCount[,c("inputTrxCount", "boundTrxCount")], "tables/hek-pulldown-yeast.txt") -write.csv(mitoch[,c("inputCdsCount", "boundCdsCount", "cdsScore")], - "tables/hek-pulldown-mitoch-cds.txt") -write.csv(miscs[,c("inputTrxCount", "boundTrxCount", "trxScore")], - "tables/hek-pulldown-miscs.txt") -write.csv(snornas[,c("inputTrxCount", "boundTrxCount", "trxScore")], - "tables/hek-pulldown-snornas.txt") -write.csv(utr5Present[,c("inputUtr5Count", "boundUtr5Count", "utr5Score")], - "tables/hek-pulldown-utr5s.txt") -write.csv(lincsPresent[,c("inputTrxCount", "boundTrxCount", "trxScore", "trxClassify")], - "tables/hek-pulldown-lincs.txt") - -write.csv(lincsPresent[,c("ensembl_gene_id", "inputTrxCount", "boundTrxCount", "trxNRead", "trxScore", "trxClassify", "external_gene_id", "external_gene_db", "gene_biotype", "transcript_biotype", "chromosome_name", "start_position", "end_position", "strand", "description")], - "tables/hek-pulldown-lincs-annotated.txt") - -l2cdf <- function(x0) { l2x <- log2(x0[!is.na(x0)]) - cdf <- data.frame( x = l2x[order(l2x)], n = seq(1,length(l2x))/length(l2x)) - cdf[ cdf$x >= -5 & cdf$x < 5, ] } - -pdf("hek-pulldown-enrich-chist-rplot.pdf", useDingbats=F) - -plot(l2cdf(with(pcodingPresent, ifelse(inputCdsCount >= 100, boundCdsCount / inputCdsCount, NA))), - xlim=c(-4,4), ylim=c(0,1), - type="l", col=solarGrey01, lwd=4) -lines(l2cdf(with(yeastCount, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarBlue, lwd=4) -lines(l2cdf(with(miscs, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarYellow, lwd=4) - -plot(l2cdf(with(pcodingPresent, ifelse(inputCdsCount >= 100, boundCdsCount / inputCdsCount, NA))), - xlim=c(-4,4), ylim=c(0,1), - type="l", col=solarGrey01, lwd=4) -title(main="5' Leaders") -lines(l2cdf(with(miscs, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarYellow, lwd=4) -lines(l2cdf(with(utr5Present, ifelse(inputUtr5Count >= 100, boundUtr5Count / inputUtr5Count, NA))), - col=solarRed, lwd=4) - -plot(l2cdf(with(pcodingPresent, ifelse(inputCdsCount >= 100, boundCdsCount / inputCdsCount, NA))), - xlim=c(-4,4), ylim=c(0,1), - type="l", col=solarGrey01, lwd=4) -title(main="snornas") -lines(l2cdf(with(miscs, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarYellow, lwd=4) -lines(l2cdf(with(snornas, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarGreen, lwd=4) - -plot(l2cdf(with(pcodingPresent, ifelse(inputCdsCount >= 100, boundCdsCount / inputCdsCount, NA))), - xlim=c(-4,4), ylim=c(0,1), - type="l", col=solarGrey01, lwd=4) -title(main="lincRNAs") -lines(l2cdf(with(miscs, ifelse(inputTrxCount >= 100, boundTrxCount / inputTrxCount, NA))), - col=solarYellow, lwd=4) -lines(l2cdf(with(lincsPresent, ifelse(inputTrxCount >= 100 & trxClassify != "Extreme", boundTrxCount / inputTrxCount, NA))), - col=solarMagenta, lwd=4) -lines(l2cdf(with(lincsPresent, ifelse(inputTrxCount >= 100 & trxClassify == "Extreme", boundTrxCount / inputTrxCount, NA))), - col=solarCyan, lwd=4) - -dev.off() diff --git a/src/FLOSS_score/count-score-hek.R b/src/FLOSS_score/count-score-hek.R deleted file mode 100644 index 0de05eb4..00000000 --- a/src/FLOSS_score/count-score-hek.R +++ /dev/null @@ -1,92 +0,0 @@ -#library("biomaRt") - -source("../shared/transcript-utils.R") -source("../shared/footprint-assignment.R") -source("../shared/floss.R") - -options(mc.cores=32) - -yeastInputCount <- read.csv("data/yeast-pulldown-input.txt", row.names=1) -yeastBoundCount <- read.csv("data/yeast-pulldown-bound.txt", row.names=1) - -yeastCount <- data.frame(inputTrxCount = yeastInputCount$trx, - inputCdsCount = yeastInputCount$cds, - inputUtr5Count = yeastInputCount$utr5, - inputUtr3Count = yeastInputCount$utr3, - row.names = row.names(yeastInputCount) ) -yeastCount[row.names(yeastBoundCount),"boundTrxCount"] <- yeastBoundCount$trx -yeastCount[row.names(yeastBoundCount),"boundCdsCount"] <- yeastBoundCount$cds -yeastCount[row.names(yeastBoundCount),"boundUtr5Count"] <- yeastBoundCount$utr5 -yeastCount[row.names(yeastBoundCount),"boundUtr3Count"] <- yeastBoundCount$utr3 - -write.csv(yeastCount, "data/yeast-pulldown-count.txt", row.names=TRUE) - -source("human-annotation.R") - -hekInputCount <- read.csv("data/hek-pulldown-input.txt", row.names=1) -hekBoundCount <- read.csv("data/hek-pulldown-bound.txt", row.names=1) - -row.names(hekInputCount) <- sub("[.][0-9]*$", "", row.names(hekInputCount)) -row.names(hekBoundCount) <- sub("[.][0-9]*$", "", row.names(hekBoundCount)) - -hekCount <- data.frame( inputTrxCount = hekInputCount$trx, - inputCdsCount = hekInputCount$cds, - inputUtr5Count = hekInputCount$utr5, - inputUtr3Count = hekInputCount$utr3, - row.names = row.names(hekInputCount) ) -hekCount[row.names(hekBoundCount),"boundTrxCount"] <- hekBoundCount$trx -hekCount[row.names(hekBoundCount),"boundCdsCount"] <- hekBoundCount$cds -hekCount[row.names(hekBoundCount),"boundUtr5Count"] <- hekBoundCount$utr5 -hekCount[row.names(hekBoundCount),"boundUtr3Count"] <- hekBoundCount$utr3 - -write.csv(hekCount, "data/hek-pulldown-count.txt", row.names=TRUE) - -inputHekLDists <- readRegionLengthDists("data/hek-pulldown-input") -boundHekLDists <- readRegionLengthDists("data/hek-pulldown-bound") - -inputHekLDists <- lapply(inputHekLDists, - function (r) { row.names(r) <- sub("[.][0-9]*$", "", row.names(r)); r }) -boundHekLDists <- lapply(boundHekLDists, - function (r) { row.names(r) <- sub("[.][0-9]*$", "", row.names(r)); r }) - -inputHekRefLDist <- colSums(inputHekLDists$cds[refCodingIds,], na.rm=TRUE) -write.csv(inputHekRefLDist, "data/hek-pulldown-input-ref-ldist.txt", row.names=TRUE) - -boundHekRefLDist <- colSums(boundHekLDists$cds[refCodingIds,], na.rm=TRUE) -write.csv(boundHekRefLDist, "data/hek-pulldown-bound-ref-ldist.txt", row.names=TRUE) - -inputHekLDFloss <- flossRegionLengthDists( inputHekLDists, inputHekRefLDist ) -inputHekFloss <- flossRegionScores( inputHekLDFloss ) -write.csv(inputHekFloss, "data/hek-pulldown-input-floss.txt", row.names=TRUE) - -boundHekLDFloss <- flossRegionLengthDists( boundHekLDists, boundHekRefLDist ) -boundHekFloss <- flossRegionScores( boundHekLDFloss ) -write.csv(boundHekFloss, "data/hek-pulldown-bound-floss.txt", row.names=TRUE) - - -inputYeastLDists <- readLengthDists("data/yeast-pulldown-input-trx") -boundYeastLDists <- readLengthDists("data/yeast-pulldown-bound-trx") - -inputYeastRefLDist <- colSums(inputYeastLDists, na.rm=TRUE) -write.csv(inputYeastRefLDist, "data/yeast-pulldown-input-ref-ldist.txt", row.names=TRUE) - -boundYeastRefLDist <- colSums(boundYeastLDists, na.rm=TRUE) -write.csv(boundYeastRefLDist, "data/yeast-pulldown-bound-ref-ldist.txt", row.names=TRUE) - -inputYeastVsYeastFloss <- flossLengthDist( inputYeastLDists, inputYeastRefLDist ) -inputYeastVsHEKFloss <- flossLengthDist( inputYeastLDists, inputHekRefLDist ) -inputYeastFloss <- data.frame(vsyeastNRead = inputYeastVsYeastFloss$NRead, - vsyeastScore = inputYeastVsYeastFloss$Score, - vshekNRead = inputYeastVsHEKFloss$NRead, - vshekScore = inputYeastVsHEKFloss$Score, - row.names = row.names(inputYeastVsYeastFloss)) -write.csv(inputYeastFloss, "data/yeast-pulldown-input-floss.txt", row.names=TRUE) - -boundYeastVsYeastFloss <- flossLengthDist( boundYeastLDists, inputYeastRefLDist ) -boundYeastVsHEKFloss <- flossLengthDist( boundYeastLDists, inputHekRefLDist ) -boundYeastFloss <- data.frame(vsyeastNRead = boundYeastVsYeastFloss$NRead, - vsyeastScore = boundYeastVsYeastFloss$Score, - vshekNRead = boundYeastVsHEKFloss$NRead, - vshekScore = boundYeastVsHEKFloss$Score, - row.names = row.names(boundYeastVsYeastFloss)) -write.csv(boundYeastFloss, "data/yeast-pulldown-bound-floss.txt", row.names=TRUE) diff --git a/src/FLOSS_score/floss-calc-mesc.R b/src/FLOSS_score/floss-calc-mesc.R deleted file mode 100644 index df603a13..00000000 --- a/src/FLOSS_score/floss-calc-mesc.R +++ /dev/null @@ -1,19 +0,0 @@ -library("biomaRt") -library("GenomicRanges") - -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -source("./src/FLOSS_score/transcript-utils.R") -source("./src/FLOSS_score/footprint-assignment.R") -source("./src/FLOSS_score/floss.R") - -source("./src/FLOSS_score/human-annotation.R") - - -chxLDists <- readRegionLengthDists("results/FLOSS_score/mesc-chx") - -chxRef <- colSums(chxLDists$cds[refCodingIds,], na.rm=TRUE) -write.csv(chxRef, "results/FLOSS_score/mesc-chx-ref-ldist.txt", row.names=TRUE) - -chxLDFloss <- flossRegionLengthDists( chxLDists, chxRef ) -chxFloss <- flossRegionScores( chxLDFloss ) -write.csv(chxFloss, "results/FLOSS_score/mesc-chx-floss.txt", row.names=TRUE) diff --git a/src/FLOSS_score/floss-plots-mesc_IP.R b/src/FLOSS_score/floss-plots-mesc_IP.R deleted file mode 100644 index e622c2a7..00000000 --- a/src/FLOSS_score/floss-plots-mesc_IP.R +++ /dev/null @@ -1,144 +0,0 @@ -library("biomaRt") -library("GenomicRanges") -library("zoo") - -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -#setwd("/Xnfs/lbmcdb/Ricci_team/HIV_project") - -source("./src/FLOSS_score/footprint-assignment.R") -source("./src/FLOSS_score/floss.R") -source("./src/FLOSS_score/human-annotation.R") -source("./src/FLOSS_score/plot-utils.R") - -hg38_NY5Floss <- read.csv("results/10_FLOSS_score/hg38_NY5_chx-floss_IP.txt", check.names=FALSE, row.names=1) - -xref <- match(row.names(hg38_NY5Floss), annots$ensembl_transcript_id) -table(is.na(xref)) -trxs <- cbind(hg38_NY5Floss, annots[xref,]) -trxs$name <- row.names(trxs) - -pcoding = trxs[trxs$name %in% codingIds & !(trxs$name %in% mitoIds) & !(trxs$name %in% dirtyPCodingIds),] -mitoch = trxs[trxs$name %in% codingIds & (trxs$name %in% mitoIds),] -lincs = trxs[trxs$name %in% lincIds ,] -snolincs = trxs[trxs$name %in% snohgIds & trxs$transcript_biotype == "lincRNA",] -miscs = trxs[trxs$name %in% miscIds,] -hiv_genes <- c("GAG","POL","VIF", "VPU", "VPR", "ENV", "REV", "VIF", "TAT", "NEF") -hivCds = trxs[trxs$name %in% hiv_genes,] -hivLtr = trxs[trxs$name %in% "LTR",] - -hg38_NY5Cutoffs <- flossCutoffs(pcoding$cdsNRead, pcoding$cdsScore) -predx <- seq(1,6,0.1) - -plotBase <- function(title) { - plot(log10(pcoding$cdsNRead), pcoding$cdsScore, xlim=c(0,6), ylim=c(0,1), - pch=".", col=solarGrey01, - axes=F, xlab="Total Reads", ylab="Fragment Length Distribution Score", cex.lab=1.33, main = title) - axis(1, log10(10**seq(0,6,1)), labels=c("1", "10", "100", "1k", "10k", "100k", "1M"), cex.axis=1) - axis(1, log10(c(sapply(10**seq(0,5,1), minor_decade))), labels=F, tcl=-0.5) - axis(2, seq(0,1,0.2)) -} - -plotDrug <- function() { - plotBase("Mito CDS, mics and lincs") - points(log10(mitoch$cdsNRead), mitoch$cdsScore, - pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - points(log10(miscs$trxNRead), miscs$trxScore, - pch=21, cex=0.75, col=solarYellow, bg=lighten(solarYellow)) - text(log10(miscs$trxNRead), miscs$trxScore, - labels=miscs$external_gene_id, col=solarYellow, pos=4 ) - points(log10(lincs$trxNRead), lincs$trxScore, - pch=21, cex=0.75, col=solarMagenta, bg=lighten(solarMagenta)) - - plotBase("5'UTR of protein coding mRNA") - points(log10(pcoding$utr5NRead), pcoding$utr5Score - , pch=21, cex=0.25, col=solarRed, bg=lighten(solarRed)) -} - -pdf("results/10_FLOSS_score/hg38_NY5-floss-rplot_IP.pdf", useDingbats=F) -plotDrug() - -plotBase("5'UTR") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -pcoding$classifyUtr5 <- hg38_NY5Cutoffs$classify(pcoding$utr5NRead, pcoding$utr5Score) -utr5Good <- pcoding[pcoding$classifyUtr5 != "Extreme" & !is.na(pcoding$classifyUtr5),] -utr5Bad <- pcoding[pcoding$classifyUtr5 == "Extreme" & !is.na(pcoding$classifyUtr5),] - -points(log10(utr5Good$utr5NRead), utr5Good$utr5Score - , pch=21, cex=0.25, col=solarRed, bg=lighten(solarRed)) -points(log10(utr5Bad$utr5NRead), utr5Bad$utr5Score - , pch=21, cex=0.25, col=solarCyan, bg=lighten(solarCyan)) - -plotBase("lincs") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -lincs$classify <- hg38_NY5Cutoffs$classify(lincs$trxNRead, lincs$trxScore) -lincsGood <- lincs[lincs$classify != "Extreme" & !is.na(lincs$classify),] -lincsBad <- lincs[lincs$classify == "Extreme" & !is.na(lincs$classify),] - -points(log10(lincsGood$trxNRead), lincsGood$trxScore, - pch=21, cex=0.25, col=solarMagenta, bg=lighten(solarMagenta)) -points(log10(lincsBad$trxNRead), lincsBad$trxScore, - pch=21, cex=0.25, col=solarGreen, bg=lighten(solarGreen)) - -plotBase("HIV CDS") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -hivCds$classify <- hg38_NY5Cutoffs$classify(hivCds$trxNRead, hivCds$trxScore) -hivCdsGood <- hivCds[hivCds$classify != "Extreme" & !is.na(hivCds$classify),] -hivCdsBad <- hivCds[hivCds$classify == "Extreme" & !is.na(hivCds$classify),] - -points(log10(hivCdsGood$trxNRead), hivCdsGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) -points(log10(hivCdsBad$trxNRead), hivCdsBad$trxScore - , pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - -plotBase("HIV LTR") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -hivLtr$classify <- hg38_NY5Cutoffs$classify(hivLtr$trxNRead, hivLtr$trxScore) -hivLtrGood <- hivLtr[hivLtr$classify != "Extreme" & !is.na(hivLtr$classify),] -hivLtrBad <- hivLtr[hivLtr$classify == "Extreme" & !is.na(hivLtr$classify),] - -points(log10(hivLtrGood$trxNRead), hivLtrGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) -points(log10(hivLtrBad$trxNRead), hivLtrBad$trxScore - , pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - -plotBase("HIV") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) -points(log10(hivCdsGood$trxNRead), hivCdsGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) -points(log10(hivCdsBad$trxNRead), hivCdsBad$trxScore - , pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) -points(log10(hivLtrGood$trxNRead), hivLtrGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarGreen)) - -dev.off() - -pcoding$classifyCds <- hg38_NY5Cutoffs$classify(pcoding$cdsNRead, pcoding$cdsScore) -table(pcoding[pcoding$cdsNRead > 100, "classifyCds"]) -sum(pcoding[pcoding$cdsNRead > 100, "classifyCds"] != "Extreme", na.rm=TRUE) / sum(!is.na(pcoding[pcoding$cdsNRead > 100, "classifyCds"])) - -table(pcoding[pcoding$utr5NRead > 100, "classifyUtr5"]) -sum(pcoding[pcoding$utr5NRead > 100, "classifyUtr5"] != "Extreme", na.rm=TRUE) / sum(!is.na(pcoding[pcoding$utr5NRead > 100, "classifyUtr5"])) - -table(lincs[lincs$trxNRead > 100, "classify"]) -sum(lincs[lincs$trxNRead > 100, "classify"] != "Extreme", na.rm=TRUE) / sum(!is.na(lincs[lincs$trxNRead > 100, "classify"])) - -write.csv(pcoding[,c("cdsNRead", "cdsScore", "classifyCds")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-refcoding-cds_IP.txt") -write.csv(pcoding[,c("utr5NRead", "utr5Score", "classifyUtr5")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-refcoding-utr5_IP.txt") -write.csv(lincs[,c("trxNRead", "trxScore", "classify")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-lincs_IP.txt") -write.csv(mitoch[,c("cdsNRead", "cdsScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-mitoch_IP.txt") -write.csv(miscs[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-miscs_IP.txt") -write.csv(hivCds[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-hivCds_IP.txt") -write.csv(hivLtr[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-hivLtr_IP.txt") -write.csv(lincs[,c("ensembl_gene_id", "trxNRead", "trxScore", "classify", "external_gene_id", "external_gene_db", "gene_biotype", "transcript_biotype", "chromosome_name", "start_position", "end_position", "strand", "description")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-lincs-annotated_IP.txt") diff --git a/src/FLOSS_score/floss-plots-mesc.R b/src/FLOSS_score/floss-plots-template.R similarity index 55% rename from src/FLOSS_score/floss-plots-mesc.R rename to src/FLOSS_score/floss-plots-template.R index cf991cfc..197c68c2 100644 --- a/src/FLOSS_score/floss-plots-mesc.R +++ b/src/FLOSS_score/floss-plots-template.R @@ -2,19 +2,17 @@ library("biomaRt") library("GenomicRanges") library("zoo") -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -#setwd("/Xnfs/lbmcdb/Ricci_team/HIV_project") - source("./src/FLOSS_score/footprint-assignment.R") source("./src/FLOSS_score/floss.R") -source("./src/FLOSS_score/human-annotation.R") +source("./src/FLOSS_score/mouse-annotation.R") source("./src/FLOSS_score/plot-utils.R") -hg38_NY5Floss <- read.csv("results/10_FLOSS_score/hg38_NY5_chx-floss.txt", check.names=FALSE, row.names=1) +Floss <- read.csv("results/chx-floss.txt", check.names=FALSE, row.names=1) +row.names(Floss) <- gsub("\\.[[:digit:]]*", "", row.names(Floss)) -xref <- match(row.names(hg38_NY5Floss), annots$ensembl_transcript_id) +xref <- match(row.names(Floss), annots$ensembl_transcript_id) table(is.na(xref)) -trxs <- cbind(hg38_NY5Floss, annots[xref,]) +trxs <- cbind(Floss, annots[xref,]) trxs$name <- row.names(trxs) pcoding = trxs[trxs$name %in% codingIds & !(trxs$name %in% mitoIds) & !(trxs$name %in% dirtyPCodingIds),] @@ -22,11 +20,8 @@ mitoch = trxs[trxs$name %in% codingIds & (trxs$name %in% mitoIds),] lincs = trxs[trxs$name %in% lincIds ,] snolincs = trxs[trxs$name %in% snohgIds & trxs$transcript_biotype == "lincRNA",] miscs = trxs[trxs$name %in% miscIds,] -hiv_genes <- c("GAG","POL","VIF", "VPU", "VPR", "ENV", "REV", "VIF", "TAT", "NEF") -hivCds = trxs[trxs$name %in% hiv_genes,] -hivLtr = trxs[trxs$name %in% "LTR",] -hg38_NY5Cutoffs <- flossCutoffs(pcoding$cdsNRead, pcoding$cdsScore) +Cutoffs <- flossCutoffs(pcoding$cdsNRead, pcoding$cdsScore) predx <- seq(1,6,0.1) plotBase <- function(title) { @@ -58,9 +53,9 @@ pdf("results/10_FLOSS_score/hg38_NY5-floss-rplot.pdf", useDingbats=F) plotDrug() plotBase("5'UTR") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) +lines(predx, predict(Cutoffs$extremeLoess, predx)) -pcoding$classifyUtr5 <- hg38_NY5Cutoffs$classify(pcoding$utr5NRead, pcoding$utr5Score) +pcoding$classifyUtr5 <- Cutoffs$classify(pcoding$utr5NRead, pcoding$utr5Score) utr5Good <- pcoding[pcoding$classifyUtr5 != "Extreme" & !is.na(pcoding$classifyUtr5),] utr5Bad <- pcoding[pcoding$classifyUtr5 == "Extreme" & !is.na(pcoding$classifyUtr5),] @@ -70,9 +65,9 @@ points(log10(utr5Bad$utr5NRead), utr5Bad$utr5Score , pch=21, cex=0.25, col=solarCyan, bg=lighten(solarCyan)) plotBase("lincs") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) +lines(predx, predict(Cutoffs$extremeLoess, predx)) -lincs$classify <- hg38_NY5Cutoffs$classify(lincs$trxNRead, lincs$trxScore) +lincs$classify <- Cutoffs$classify(lincs$trxNRead, lincs$trxScore) lincsGood <- lincs[lincs$classify != "Extreme" & !is.na(lincs$classify),] lincsBad <- lincs[lincs$classify == "Extreme" & !is.na(lincs$classify),] @@ -81,33 +76,9 @@ points(log10(lincsGood$trxNRead), lincsGood$trxScore, points(log10(lincsBad$trxNRead), lincsBad$trxScore, pch=21, cex=0.25, col=solarGreen, bg=lighten(solarGreen)) -plotBase("HIV CDS") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -hivCds$classify <- hg38_NY5Cutoffs$classify(hivCds$trxNRead, hivCds$trxScore) -hivCdsGood <- hivCds[hivCds$classify != "Extreme" & !is.na(hivCds$classify),] -hivCdsBad <- hivCds[hivCds$classify == "Extreme" & !is.na(hivCds$classify),] - -points(log10(hivCdsGood$trxNRead), hivCdsGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) -points(log10(hivCdsBad$trxNRead), hivCdsBad$trxScore - , pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - -plotBase("HIV LTR") -lines(predx, predict(hg38_NY5Cutoffs$extremeLoess, predx)) - -hivLtr$classify <- hg38_NY5Cutoffs$classify(hivLtr$trxNRead, hivLtr$trxScore) -hivLtrGood <- hivLtr[hivLtr$classify != "Extreme" & !is.na(hivLtr$classify),] -hivLtrBad <- hivLtr[hivLtr$classify == "Extreme" & !is.na(hivLtr$classify),] - -points(log10(hivLtrGood$trxNRead), hivLtrGood$trxScore - , pch=21, cex=0.75, col=solarRed, bg=lighten(solarRed)) -points(log10(hivLtrBad$trxNRead), hivLtrBad$trxScore - , pch=21, cex=0.75, col=solarCyan, bg=lighten(solarCyan)) - dev.off() -pcoding$classifyCds <- hg38_NY5Cutoffs$classify(pcoding$cdsNRead, pcoding$cdsScore) +pcoding$classifyCds <- Cutoffs$classify(pcoding$cdsNRead, pcoding$cdsScore) table(pcoding[pcoding$cdsNRead > 100, "classifyCds"]) sum(pcoding[pcoding$cdsNRead > 100, "classifyCds"] != "Extreme", na.rm=TRUE) / sum(!is.na(pcoding[pcoding$cdsNRead > 100, "classifyCds"])) @@ -117,19 +88,17 @@ sum(pcoding[pcoding$utr5NRead > 100, "classifyUtr5"] != "Extreme", na.rm=TRUE) / table(lincs[lincs$trxNRead > 100, "classify"]) sum(lincs[lincs$trxNRead > 100, "classify"] != "Extreme", na.rm=TRUE) / sum(!is.na(lincs[lincs$trxNRead > 100, "classify"])) +dir.create("results/tables", showWarnings = FALSE) + write.csv(pcoding[,c("cdsNRead", "cdsScore", "classifyCds")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-refcoding-cds.txt") + "results/tables/floss-refcoding-cds.txt") write.csv(pcoding[,c("utr5NRead", "utr5Score", "classifyUtr5")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-refcoding-utr5.txt") + "results/tables/floss-refcoding-utr5.txt") write.csv(lincs[,c("trxNRead", "trxScore", "classify")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-lincs.txt") + "results/tables/floss-lincs.txt") write.csv(mitoch[,c("cdsNRead", "cdsScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-mitoch.txt") + "results/tables/floss-mitoch.txt") write.csv(miscs[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-miscs.txt") -write.csv(hivCds[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-hivCds.txt") -write.csv(hivLtr[,c("trxNRead", "trxScore")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-hivLtr.txt") -write.csv(lincs[,c("ensembl_gene_id", "trxNRead", "trxScore", "classify", "external_gene_id", "external_gene_db", "gene_biotype", "transcript_biotype", "chromosome_name", "start_position", "end_position", "strand", "description")], - "results/10_FLOSS_score/tables/hg38_NY5-floss-lincs-annotated.txt") + "results/tables/floss-miscs.txt") +write.csv(lincs[,c("ensembl_gene_id", "trxNRead", "trxScore", "classify", "external_gene_name", "gene_biotype", "transcript_biotype", "chromosome_name", "start_position", "end_position", "strand", "description")], + "results/tables/floss-lincs-annotated.txt") diff --git a/src/FLOSS_score/human-annotation.R b/src/FLOSS_score/human-annotation.R index cfed5ff5..c40597dc 100644 --- a/src/FLOSS_score/human-annotation.R +++ b/src/FLOSS_score/human-annotation.R @@ -6,6 +6,7 @@ ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", host annots <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype", "chromosome_name", "start_position", "end_position", "strand", "description", "hgnc_symbol"), mart=ensembl) +colnames(annots)[length(colnames(annots))] <- "external_gene_name" granges <- GRanges(seqnames = annots$chromosome_name, ranges=IRanges(start = annots$start_position, end = annots$end_position), diff --git a/src/FLOSS_score/ldist-calc-mesc.R b/src/FLOSS_score/ldist-calc-mesc.R deleted file mode 100644 index 013a4d47..00000000 --- a/src/FLOSS_score/ldist-calc-mesc.R +++ /dev/null @@ -1,76 +0,0 @@ -library("rtracklayer") -library("Rsamtools") -library("parallel") -library("biomaRt") -library("GenomicRanges") - -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -# setwd("/Xnfs/lbmcdb/Ricci_team/HIV_project") -source("./src/FLOSS_score/transcript-utils.R") -source("./src/FLOSS_score/footprint-assignment.R") -source("./src/FLOSS_score/floss.R") - -source("./src/FLOSS_score/human-annotation.R") - - -options(mc.cores=8) - -# Annotation file which in bed format with no reduncance in transcripts -hg38_ny5_BedFile <- "/media/manu/ManuDisque/HIV_project/Ribosome_Profiling/U937_bam/FLOSS/hg38_exon_uniq.bed" -hg38_pnl4.3_BedFile <- "/media/manu/ManuDisque/HIV_project/RibosomeProfiling_2/HEK_IP_FLOSS/hg38_exon_uniq_NC_001802.1.bed" - -## Bamfiles need to be sorted and indexed -## Bamfiles need to contain alignment on human AND viral genome -chxBamFile <- BamFile("/media/manu/ManuDisque/HIV_project/Ribosome_Profiling/U937_bam/FLOSS/U937_CHX_all_hg38_NY5_sort.bam") -chxBamFileIP <- BamFile("/media/manu/ManuDisque/HIV_project/RibosomeProfiling_2/HEK_IP_FLOSS/HEK_IP_HIV1_CHX_rep1_hg38_pNL43.bam") - -aoffsets <- data.frame(asite=c(11,11,11,11,12,12), row.names=seq(26,31)) # offset determined by Ribocode software - -# for U937 - NY5 -hg38_NY5_Bed <- import(hg38_ny5_BedFile, format="BED") -hg38_NY5_Bed$name <- gsub("\\.[[:digit:]]","" ,hg38_NY5_Bed$name ) # remove version of transcript - -hg38_NY5_Size <- regionCountFrame(countSizes(defaultInsets, hg38_NY5_Bed)) -write.csv(hg38_NY5_Size, "results/10_FLOSS_score/hg38_NY5_sizes.txt", row.names=TRUE) - -hg38_NY5_Chx <- regionCountFrame(countAligns(aoffsets, defaultInsets, chxBamFile, hg38_NY5_Bed)) -write.csv(hg38_NY5_Chx, "results/10_FLOSS_score/hg38_NY5_chx.txt", row.names=TRUE) - -chxLDists <- calcTrxRegionLengthDists(chxBamFile, hg38_NY5_Bed) -writeRegionLengthDists(chxLDists, "results/10_FLOSS_score/hg38_NY5_Chx") - -# for HEK - pNL4.3 -> IP - -hg38_pnl4.3_Bed <- import(hg38_pnl4.3_BedFile, format="BED") -hg38_pnl4.3_Bed$name <- gsub("\\.[[:digit:]]","" ,hg38_pnl4.3_Bed$name ) # remove version of transcript - -hg38_NY5_Chx_IP <- regionCountFrame(countAligns(aoffsets, defaultInsets, chxBamFileIP, hg38_pnl4.3_Bed)) -write.csv(hg38_NY5_Chx_IP, "results/10_FLOSS_score/hg38_pNL43_chx_IP.txt", row.names=TRUE) - -chxLDists_IP <- calcTrxRegionLengthDists(chxBamFileIP, hg38_pnl4.3_Bed) -writeRegionLengthDists(chxLDists_IP, "results/10_FLOSS_score/hg38_pNL43_chx_IP") - - - -########### - -chxLDists <- readRegionLengthDists("results/10_FLOSS_score/hg38_NY5_Chx") - - -chxRef <- colSums(chxLDists$cds[refCodingIds,], na.rm=TRUE) -write.csv(chxRef, "results/10_FLOSS_score/hg38_NY5_chx-ref-ldist.txt", row.names=TRUE) - -chxLDFloss <- flossRegionLengthDists( chxLDists, chxRef ) -chxFloss <- flossRegionScores( chxLDFloss ) -write.csv(chxFloss, "results/10_FLOSS_score/hg38_NY5_chx-floss.txt", row.names=TRUE) - -# for HEK - pNL4.3 -> IP - -chxLDists_IP_tralala <- readRegionLengthDists("results/10_FLOSS_score/hg38_pNL43_chx_IP") - -chxRef_IP <- colSums(chxLDists_IP_tralala$cds[refCodingIds,], na.rm=TRUE) -write.csv(chxRef_IP, "results/10_FLOSS_score/hg38_NY5_chx-ref-ldist_IP.txt", row.names=TRUE) - -chxLDFloss_IP <- flossRegionLengthDists( chxLDists_IP_tralala, chxRef_IP ) -chxFloss_IP <- flossRegionScores( chxLDFloss_IP ) -write.csv(chxFloss_IP, "results/10_FLOSS_score/hg38_NY5_chx-floss_IP.txt", row.names=TRUE) diff --git a/src/FLOSS_score/ldist-calc-template.R b/src/FLOSS_score/ldist-calc-template.R new file mode 100644 index 00000000..501dc3f3 --- /dev/null +++ b/src/FLOSS_score/ldist-calc-template.R @@ -0,0 +1,47 @@ +library("rtracklayer") +library("Rsamtools") +library("parallel") +library("biomaRt") +library("GenomicRanges") + +source("./src/FLOSS_score/transcript-utils.R") +source("./src/FLOSS_score/footprint-assignment.R") +source("./src/FLOSS_score/floss.R") + +source("./src/FLOSS_score/mouse-annotation.R") + + +options(mc.cores=8) + +# Annotation file which in bed format with no reduncance in transcripts +BedFile <- "data/wgEncodeGencodeBasicVM24_chr.bed" + +## Bamfiles need to be sorted and indexed +chxBamFile <- BamFile("data/RibosomeProfiling_sort.bam") + +aoffsets <- data.frame(asite=c(11,11,11,11,12,12), row.names=seq(26,31)) # offset determined by Ribocode software + +# for Samples +Bed <- import(BedFile, format="BED") +Bed$name <- gsub("\\.[[:digit:]]","" ,Bed$name ) # remove version of transcript + +Size <- regionCountFrame(countSizes(defaultInsets, Bed)) +write.csv(Size, "results/sizes.txt", row.names=TRUE) + +Chx <- regionCountFrame(countAligns(asiteOffsets = aoffsets, insets = defaultInsets, bamfile = chxBamFile, trxBed = Bed)) +write.csv(Chx, "results/Chx.txt", row.names=TRUE) + +chxLDists <- calcTrxRegionLengthDists(bamfile = chxBamFile, trxBed = Bed) +writeRegionLengthDists(chxLDists, "results/chxLDists") + +########### + +chxLDists <- readRegionLengthDists("results/chxLDists") + + +chxRef <- colSums(chxLDists$cds[refCodingIds,], na.rm = TRUE) +write.csv(chxRef, "results/chx-ref-ldist.txt", row.names = TRUE) + +chxLDFloss <- flossRegionLengthDists( chxLDists, chxRef ) +chxFloss <- flossRegionScores( chxLDFloss ) +write.csv(chxFloss, "results/chx-floss.txt", row.names=TRUE) diff --git a/src/FLOSS_score/ldist-plots-mesc.R b/src/FLOSS_score/ldist-plots-mesc.R deleted file mode 100644 index 5a521eae..00000000 --- a/src/FLOSS_score/ldist-plots-mesc.R +++ /dev/null @@ -1,281 +0,0 @@ -library("biomaRt") -library("GenomicRanges") -library("boot") - -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -#setwd("/Xnfs/lbmcdb/Ricci_team/HIV_project") - -source("./src/FLOSS_score/transcript-utils.R") -source("./src/FLOSS_score/footprint-assignment.R") -source("./src/FLOSS_score/floss.R") -source("./src/FLOSS_score/plot-utils.R") - -source("./src/FLOSS_score/human-annotation.R") - -cdist <- function(ldists) { normLDist(colSums(ldists, na.rm=TRUE)) } - -lengthLevels <- seq(defaultLengthMin, defaultLengthMax) - -hg38_NY5RefLDist <- normLDist(read.csv("results/10_FLOSS_score/hg38_NY5_chx-ref-ldist.txt", check.names=F, row.names=1)[,1]) -hg38_NY5LDists <- readRegionLengthDists("results/10_FLOSS_score/hg38_NY5_Chx") - -hg38_NY5RefCodingCdses <- hg38_NY5LDists$cds[row.names(hg38_NY5LDists$cds) %in% refCodingIds,] -hg38_NY5RefCodingUtr5s <- hg38_NY5LDists$utr5[row.names(hg38_NY5LDists$utr5) %in% refCodingIds,] -hg38_NY5Lincs <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% cleanLincIds,] -hg38_NY5MitoCdses <- hg38_NY5LDists$cds[row.names(hg38_NY5LDists$cds) %in% mitoCodingIds,] - -vault <- annots[annots$hgnc_symbol == "VTRNA1-1", "ensembl_transcript_id"] -terc <- annots[annots$hgnc_symbol == "TERC", "ensembl_transcript_id"] -snhg5 <- annots[annots$hgnc_symbol == "SNHG5", "ensembl_transcript_id"] - -hg38_NY5Vault <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% vault,] -hg38_NY5Terc <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% terc,] -hg38_NY5Snhg5 <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% snhg5,] -hiv_genes <- c("GAG","POL","VIF", "VPU", "VPR", "ENV", "REV", "VIF", "TAT", "NEF") -hg38_NY5HIVCDS <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% hiv_genes,] -hg38_NY5HIVLTR <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% "LTR",] - -hg38_NY5CdsDist <- cdist(hg38_NY5RefCodingCdses) - -pdf("./results/10_FLOSS_score/hg38_NY5-examples-rplot.pdf") -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.6), xlab="Fragment Length", main = "LincRNA") -lines(lengthLevels, cdist(hg38_NY5Lincs), type="l", col=solarMagenta, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "5' UTR") -lines(lengthLevels, cdist(hg38_NY5RefCodingUtr5s), type="l", col=solarRed, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Mitochondrial genes") -lines(lengthLevels, cdist(hg38_NY5MitoCdses), type="l", col=solarCyan, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Vault") -lines(lengthLevels, cdist(hg38_NY5Vault), - type="l", col=solarYellow, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Terc") -lines(lengthLevels, cdist(hg38_NY5Terc), - type="l", col=solarYellow, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "Snhg5") -lines(lengthLevels, cdist(hg38_NY5Snhg5), - type="l", col=solarGreen, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "HIV CDS") -lines(lengthLevels, cdist(hg38_NY5HIVCDS), - type="l", col=solarBlue, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "HIV LTR") -lines(lengthLevels, cdist(hg38_NY5HIVLTR), - type="l", col=solarOrange, lwd=2) - -dev.off() - -# Write data tables -write.csv(hg38_NY5RefCodingCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-cds.txt") -write.csv(hg38_NY5Terc, "results/10_FLOSS_score/tables/ldist-hg38_NY5-terc-trx.txt") -write.csv(hg38_NY5Vault, "results/10_FLOSS_score/tables/ldist-hg38_NY5-vault-trx.txt") -write.csv(hg38_NY5MitoCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-mito-cds.txt") -write.csv(hg38_NY5Snhg5, "results/10_FLOSS_score/tables/ldist-hg38_NY5-snhg5-trx.txt") -write.csv(hg38_NY5Lincs, "results/10_FLOSS_score/tables/ldist-hg38_NY5-linc-trx.txt") -write.csv(hg38_NY5RefCodingUtr5s, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-utr5.txt") -write.csv(hg38_NY5HIVCDS, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-cds.txt") -write.csv(hg38_NY5HIVLTR, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-ltr.txt") - -write.csv(rbind(refCodingCdses = cdist(hg38_NY5RefCodingCdses), - cleanLincs = cdist(hg38_NY5Lincs), - refCodingUtr5s = cdist(hg38_NY5RefCodingUtr5s), - terc = cdist(hg38_NY5Terc), - vault = cdist(hg38_NY5Vault), - snhg5 = cdist(hg38_NY5Snhg5), - hivCds = cdist(hg38_NY5HIVCDS), - hivLtr = cdist(hg38_NY5HIVLTR)), - "results/FLOSS_score/tables/ldist-hg38_NY5.txt") - -## STOP HERE ## -## I'm not sure that I need the following code - - -# # Need FLOSS to pick out real lincRNAs. -# emetFloss <- read.csv("results/FLOSS_score/mesc-chx-floss.txt", check.names=FALSE, row.names=1) -# emetCutoffs <- flossCutoffs(emetFloss[refCodingIds,]$cdsNRead, emetFloss[refCodingIds,]$cdsScore) -# emetLincFloss <- emetFloss[cleanLincIds,] -# emetLincFloss$classify <- emetCutoffs$classify(emetLincFloss$trxNRead, emetLincFloss$trxScore) -# flossLincIds <- row.names(emetLincFloss[emetLincFloss$classify != "Extreme" & !is.na(emetLincFloss$classify),]) -# -# mescChxCdsLDist <- cdist(mescChxLDists$cds[refCodingIds,]) -# mescEmetCdsLDist <- cdist(mescEmetLDists$cds[refCodingIds,]) -# -# mescChxUtr5LDist <- cdist(mescChxLDists$utr5[refCodingIds,]) -# mescEmetUtr5LDist <- cdist(mescEmetLDists$utr5[refCodingIds,]) -# -# mescChxLincLDist <- cdist(mescChxLDists$trx[flossLincIds,]) -# mescEmetLincLDist <- cdist(mescEmetLDists$trx[flossLincIds,]) -# -# mescChxMitoLDist <- cdist(mescChxLDists$cds[mitoCodingIds,]) -# mescEmetMitoLDist <- cdist(mescEmetLDists$cds[mitoCodingIds,]) -# -# yeastChxLDists <- readLengthDists("data/yeast-spike-chx-trx") -# yeastEmetLDists <- readLengthDists("data/yeast-spike-emet-trx") -# -# yeastChxLDist <- cdist(yeastChxLDists) -# yeastEmetLDist <- cdist(yeastEmetLDists) -# -# pdf("length-shift-cumul-rplot.pdf") -# plot(lengthLevels, mescChxCdsLDist, type="l", cex=2, col=solarOrange, xlim=c(26,34), ylim=c(0,0.3), lwd=2) -# lines(lengthLevels, mescEmetCdsLDist, cex=2, col=solarBlue, lwd=2) -# -# plot(lengthLevels, cumsum(mescChxCdsLDist), type="l", cex=2, col=solarOrange, xlim=c(26,34), ylim=c(0,1.0), lwd=2) -# lines(lengthLevels, cumsum(mescEmetCdsLDist), cex=2, col=solarBlue, lwd=2) -# -# plotFour <- function( chx, emet, name ) { -# plot(lengthLevels, chx, type="l", cex=2, col=solarYellow, xlim=c(26,34), ylim=c(0,0.3), lwd=2, main=name) -# lines(lengthLevels, emet, cex=2, col=solarCyan, lwd=2) -# lines(lengthLevels, mescChxCdsLDist, cex=2, col=lighten(solarOrange, lfact=0.4), lwd=2) -# lines(lengthLevels, mescEmetCdsLDist, cex=2, col=lighten(solarBlue, lfact=0.4), lwd=2) -# -# plot(lengthLevels, cumsum(chx), type="l", cex=2, col=solarYellow, xlim=c(26,34), ylim=c(0,1.0), lwd=2, main=name) -# lines(lengthLevels, cumsum(emet), cex=2, col=solarCyan, lwd=2) -# lines(lengthLevels, cumsum(mescChxCdsLDist), cex=2, col=lighten(solarOrange, lfact=0.4), lwd=2) -# lines(lengthLevels, cumsum(mescEmetCdsLDist), cex=2, col=lighten(solarBlue, lfact=0.4), lwd=2) -# -# write.csv(rbind(chxCodingCds = cumsum(mescChxCdsLDist), -# emetCodingCds = cumsum(mescEmetCdsLDist), -# chx = cumsum(chx), -# emet = cumsum(emet)), -# sprintf("tables/lshift-%s.txt", name)) -# } -# -# plotFour( mescChxUtr5LDist, mescEmetUtr5LDist, "5' UTR" ) -# plotFour( mescChxLincLDist, mescEmetLincLDist, "All lincRNAs" ) -# plotFour( yeastChxLDist, yeastEmetLDist, "Yeast" ) -# -# plotFour(cdist(mescChxLDists$trx[terc,]), -# cdist(mescEmetLDists$trx[terc,]), "Terc") -# -# plotFour(cdist(mescChxLDists$trx[vault,]), -# cdist(mescEmetLDists$trx[vault,]), "Vaultrc5") -# -# plotFour(cdist(mescChxLDists$trx[snhg5,]), -# cdist(mescEmetLDists$trx[snhg5,]), "Snhg5") -# -# dev.off() -# -# shiftScore <- function(ldShort, ldLong) { -# cldShort <- cumsum(ldShort) -# cldLong <- cumsum(ldLong) -# sum(cldShort - cldLong) -# } -# -# ldistShiftStat <- function(ldists, sampleIndex) { -# lastChx <- floor(length(sampleIndex) / 2) -# -# chxLD <- cdist(ldists[sampleIndex[1:lastChx],]) -# names(chxLD) <- sub("^", "chx", names(chxLD)) -# -# emetLD <- cdist(ldists[sampleIndex[(lastChx+1):(nrow(ldists))],]) -# names(emetLD) <- sub("^", "emet", names(emetLD)) -# -# v <- append(chxLD, emetLD) -# v <- append(v, ldistDiff(chxLD, emetLD)) -# v <- append(v, shiftScore(chxLD, emetLD)) -# v -# } -# -# cdsShiftBoot <- boot(data = rbind(mescChxLDists$cds[refCodingIds,], -# mescEmetLDists$cds[refCodingIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# utr5ShiftBoot <- boot(data = rbind(mescChxLDists$utr5[refCodingIds,], -# mescEmetLDists$utr5[refCodingIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# lincShiftBoot <- boot(data = rbind(mescChxLDists$trx[cleanLincIds,], -# mescEmetLDists$trx[cleanLincIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# yeastShiftBoot <- boot(data = rbind(yeastChxLDists, yeastEmetLDists), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# capture.output(summary(cdsShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt") -# capture.output(cdsShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(cdsShiftBoot$t[,20]) >= abs(cdsShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(cdsShiftBoot$t[,20]) < abs(cdsShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(utr5ShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(utr5ShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(utr5ShiftBoot$t[,20]) >= abs(utr5ShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(utr5ShiftBoot$t[,20]) < abs(utr5ShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(lincShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(lincShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(lincShiftBoot$t[,20]) >= abs(lincShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(lincShiftBoot$t[,20]) < abs(lincShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(yeastShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(yeastShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(yeastShiftBoot$t[,20]) >= abs(yeastShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(yeastShiftBoot$t[,20]) < abs(yeastShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# pdf("ldist-shift-bootstrap-plot.pdf") -# -# bootplot <- function(b, t) { -# hist(b$t[,20], xlim=c(-1,1)) -# abline(v = b$t0[20]) -# title(main = t) -# -# boxplot(b$t[,20], ylim=c(-1,1), -# pars=list(outpch=21, outcex=0.5, -# boxwex=0.2, boxlty = "blank", boxfill=solarGrey01, medlwd = 0.33, medcol = "white" -# )) -# points(c(1), c(b$t0[20]), pch=21, col=solarRed, bg=lighten(solarRed)) -# axis(side=2, at=seq(-1,1,0.1), labels = FALSE) -# title(main = t) -# } -# -# plot(cdsShiftBoot, index=20) -# plot(utr5ShiftBoot, index=20) -# plot(lincShiftBoot, index=20) -# plot(yeastShiftBoot, index=20) -# -# bootplot(cdsShiftBoot, "Coding CDS") -# bootplot(utr5ShiftBoot, "Coding 5 UTR") -# bootplot(lincShiftBoot, "linc") -# bootplot(yeastShiftBoot, "Yeast") -# -# boxplot(cdsShiftBoot$t[,20], utr5ShiftBoot$t[,20], -# lincShiftBoot$t[,20], yeastShiftBoot$t[,20], -# ylim=c(-1,1), -# names=c("CDS", "Utr5", "linc", "yeast"), -# pars=list(outpch=21, outcex=0.5, -# boxwex=0.4, boxlty = "blank", boxfill=solarGrey01, medlwd = 0.33, medcol = "white" -# )) -# points(seq(1,4), -# c(cdsShiftBoot$t0[20], utr5ShiftBoot$t0[20], -# lincShiftBoot$t0[20], yeastShiftBoot$t0[20]), -# pch=21, col=solarRed, bg=lighten(solarRed)) -# axis(side=2, at=seq(-1,1,0.1), labels = FALSE) -# -# dev.off() diff --git a/src/FLOSS_score/ldist-plots-mesc_ip.R b/src/FLOSS_score/ldist-plots-mesc_ip.R deleted file mode 100644 index 373c8822..00000000 --- a/src/FLOSS_score/ldist-plots-mesc_ip.R +++ /dev/null @@ -1,281 +0,0 @@ -library("biomaRt") -library("GenomicRanges") -library("boot") - -setwd("~/GoogleDrive/Post-Doc_ENS/Documents/RMI2/gitlab/HIV_project") -#setwd("/Xnfs/lbmcdb/Ricci_team/HIV_project") - -source("./src/FLOSS_score/transcript-utils.R") -source("./src/FLOSS_score/footprint-assignment.R") -source("./src/FLOSS_score/floss.R") -source("./src/FLOSS_score/plot-utils.R") - -source("./src/FLOSS_score/human-annotation.R") - -cdist <- function(ldists) { normLDist(colSums(ldists, na.rm=TRUE)) } - -lengthLevels <- seq(defaultLengthMin, defaultLengthMax) - -hg38_NY5RefLDist <- normLDist(read.csv("results/10_FLOSS_score/hg38_NY5_chx-ref-ldist_IP.txt", check.names=F, row.names=1)[,1]) -hg38_NY5LDists <- readRegionLengthDists("results/10_FLOSS_score/hg38_pNL43_chx_IP") - -hg38_NY5RefCodingCdses <- hg38_NY5LDists$cds[row.names(hg38_NY5LDists$cds) %in% refCodingIds,] -hg38_NY5RefCodingUtr5s <- hg38_NY5LDists$utr5[row.names(hg38_NY5LDists$utr5) %in% refCodingIds,] -hg38_NY5Lincs <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% cleanLincIds,] -hg38_NY5MitoCdses <- hg38_NY5LDists$cds[row.names(hg38_NY5LDists$cds) %in% mitoCodingIds,] - -vault <- annots[annots$hgnc_symbol == "VTRNA1-1", "ensembl_transcript_id"] -terc <- annots[annots$hgnc_symbol == "TERC", "ensembl_transcript_id"] -snhg5 <- annots[annots$hgnc_symbol == "SNHG5", "ensembl_transcript_id"] - -hg38_NY5Vault <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% vault,] -hg38_NY5Terc <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% terc,] -hg38_NY5Snhg5 <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% snhg5,] -hiv_genes <- c("GAG","POL","VIF", "VPU", "VPR", "ENV", "REV", "VIF", "TAT", "NEF") -hg38_NY5HIVCDS <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% hiv_genes,] -hg38_NY5HIVLTR <- hg38_NY5LDists$trx[row.names(hg38_NY5LDists$trx) %in% "LTR",] - -hg38_NY5CdsDist <- cdist(hg38_NY5RefCodingCdses) - -pdf("./results/10_FLOSS_score/hg38_pNL43-examples-rplot_IP.pdf") -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.6), xlab="Fragment Length", main = "LincRNA") -lines(lengthLevels, cdist(hg38_NY5Lincs), type="l", col=solarMagenta, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "5' UTR") -lines(lengthLevels, cdist(hg38_NY5RefCodingUtr5s), type="l", col=solarRed, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Mitochondrial genes") -lines(lengthLevels, cdist(hg38_NY5MitoCdses), type="l", col=solarCyan, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Vault") -lines(lengthLevels, cdist(hg38_NY5Vault), - type="l", col=solarYellow, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "Terc") -lines(lengthLevels, cdist(hg38_NY5Terc), - type="l", col=solarYellow, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "Snhg5") -lines(lengthLevels, cdist(hg38_NY5Snhg5), - type="l", col=solarGreen, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "HIV CDS") -lines(lengthLevels, cdist(hg38_NY5HIVCDS), - type="l", col=solarBlue, lwd=2) - -plot(lengthLevels, hg38_NY5CdsDist, type="l", col=lighten(solarGrey01), lwd=4, - xlim=c(26,34), ylim=c(0,0.35), xlab="Fragment Length", main = "HIV LTR") -lines(lengthLevels, cdist(hg38_NY5HIVLTR), - type="l", col=solarOrange, lwd=2) - -dev.off() - -# Write data tables -write.csv(hg38_NY5RefCodingCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-cds.txt") -write.csv(hg38_NY5Terc, "results/10_FLOSS_score/tables/ldist-hg38_NY5-terc-trx.txt") -write.csv(hg38_NY5Vault, "results/10_FLOSS_score/tables/ldist-hg38_NY5-vault-trx.txt") -write.csv(hg38_NY5MitoCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-mito-cds.txt") -write.csv(hg38_NY5Snhg5, "results/10_FLOSS_score/tables/ldist-hg38_NY5-snhg5-trx.txt") -write.csv(hg38_NY5Lincs, "results/10_FLOSS_score/tables/ldist-hg38_NY5-linc-trx.txt") -write.csv(hg38_NY5RefCodingUtr5s, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-utr5.txt") -write.csv(hg38_NY5HIVCDS, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-cds.txt") -write.csv(hg38_NY5HIVLTR, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-ltr.txt") - -write.csv(rbind(refCodingCdses = cdist(hg38_NY5RefCodingCdses), - cleanLincs = cdist(hg38_NY5Lincs), - refCodingUtr5s = cdist(hg38_NY5RefCodingUtr5s), - terc = cdist(hg38_NY5Terc), - vault = cdist(hg38_NY5Vault), - snhg5 = cdist(hg38_NY5Snhg5), - hivCds = cdist(hg38_NY5HIVCDS), - hivLtr = cdist(hg38_NY5HIVLTR)), - "results/FLOSS_score/tables/ldist-hg38_NY5.txt") - -## STOP HERE ## -## I'm not sure that I need the following code - - -# # Need FLOSS to pick out real lincRNAs. -# emetFloss <- read.csv("results/FLOSS_score/mesc-chx-floss.txt", check.names=FALSE, row.names=1) -# emetCutoffs <- flossCutoffs(emetFloss[refCodingIds,]$cdsNRead, emetFloss[refCodingIds,]$cdsScore) -# emetLincFloss <- emetFloss[cleanLincIds,] -# emetLincFloss$classify <- emetCutoffs$classify(emetLincFloss$trxNRead, emetLincFloss$trxScore) -# flossLincIds <- row.names(emetLincFloss[emetLincFloss$classify != "Extreme" & !is.na(emetLincFloss$classify),]) -# -# mescChxCdsLDist <- cdist(mescChxLDists$cds[refCodingIds,]) -# mescEmetCdsLDist <- cdist(mescEmetLDists$cds[refCodingIds,]) -# -# mescChxUtr5LDist <- cdist(mescChxLDists$utr5[refCodingIds,]) -# mescEmetUtr5LDist <- cdist(mescEmetLDists$utr5[refCodingIds,]) -# -# mescChxLincLDist <- cdist(mescChxLDists$trx[flossLincIds,]) -# mescEmetLincLDist <- cdist(mescEmetLDists$trx[flossLincIds,]) -# -# mescChxMitoLDist <- cdist(mescChxLDists$cds[mitoCodingIds,]) -# mescEmetMitoLDist <- cdist(mescEmetLDists$cds[mitoCodingIds,]) -# -# yeastChxLDists <- readLengthDists("data/yeast-spike-chx-trx") -# yeastEmetLDists <- readLengthDists("data/yeast-spike-emet-trx") -# -# yeastChxLDist <- cdist(yeastChxLDists) -# yeastEmetLDist <- cdist(yeastEmetLDists) -# -# pdf("length-shift-cumul-rplot.pdf") -# plot(lengthLevels, mescChxCdsLDist, type="l", cex=2, col=solarOrange, xlim=c(26,34), ylim=c(0,0.3), lwd=2) -# lines(lengthLevels, mescEmetCdsLDist, cex=2, col=solarBlue, lwd=2) -# -# plot(lengthLevels, cumsum(mescChxCdsLDist), type="l", cex=2, col=solarOrange, xlim=c(26,34), ylim=c(0,1.0), lwd=2) -# lines(lengthLevels, cumsum(mescEmetCdsLDist), cex=2, col=solarBlue, lwd=2) -# -# plotFour <- function( chx, emet, name ) { -# plot(lengthLevels, chx, type="l", cex=2, col=solarYellow, xlim=c(26,34), ylim=c(0,0.3), lwd=2, main=name) -# lines(lengthLevels, emet, cex=2, col=solarCyan, lwd=2) -# lines(lengthLevels, mescChxCdsLDist, cex=2, col=lighten(solarOrange, lfact=0.4), lwd=2) -# lines(lengthLevels, mescEmetCdsLDist, cex=2, col=lighten(solarBlue, lfact=0.4), lwd=2) -# -# plot(lengthLevels, cumsum(chx), type="l", cex=2, col=solarYellow, xlim=c(26,34), ylim=c(0,1.0), lwd=2, main=name) -# lines(lengthLevels, cumsum(emet), cex=2, col=solarCyan, lwd=2) -# lines(lengthLevels, cumsum(mescChxCdsLDist), cex=2, col=lighten(solarOrange, lfact=0.4), lwd=2) -# lines(lengthLevels, cumsum(mescEmetCdsLDist), cex=2, col=lighten(solarBlue, lfact=0.4), lwd=2) -# -# write.csv(rbind(chxCodingCds = cumsum(mescChxCdsLDist), -# emetCodingCds = cumsum(mescEmetCdsLDist), -# chx = cumsum(chx), -# emet = cumsum(emet)), -# sprintf("tables/lshift-%s.txt", name)) -# } -# -# plotFour( mescChxUtr5LDist, mescEmetUtr5LDist, "5' UTR" ) -# plotFour( mescChxLincLDist, mescEmetLincLDist, "All lincRNAs" ) -# plotFour( yeastChxLDist, yeastEmetLDist, "Yeast" ) -# -# plotFour(cdist(mescChxLDists$trx[terc,]), -# cdist(mescEmetLDists$trx[terc,]), "Terc") -# -# plotFour(cdist(mescChxLDists$trx[vault,]), -# cdist(mescEmetLDists$trx[vault,]), "Vaultrc5") -# -# plotFour(cdist(mescChxLDists$trx[snhg5,]), -# cdist(mescEmetLDists$trx[snhg5,]), "Snhg5") -# -# dev.off() -# -# shiftScore <- function(ldShort, ldLong) { -# cldShort <- cumsum(ldShort) -# cldLong <- cumsum(ldLong) -# sum(cldShort - cldLong) -# } -# -# ldistShiftStat <- function(ldists, sampleIndex) { -# lastChx <- floor(length(sampleIndex) / 2) -# -# chxLD <- cdist(ldists[sampleIndex[1:lastChx],]) -# names(chxLD) <- sub("^", "chx", names(chxLD)) -# -# emetLD <- cdist(ldists[sampleIndex[(lastChx+1):(nrow(ldists))],]) -# names(emetLD) <- sub("^", "emet", names(emetLD)) -# -# v <- append(chxLD, emetLD) -# v <- append(v, ldistDiff(chxLD, emetLD)) -# v <- append(v, shiftScore(chxLD, emetLD)) -# v -# } -# -# cdsShiftBoot <- boot(data = rbind(mescChxLDists$cds[refCodingIds,], -# mescEmetLDists$cds[refCodingIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# utr5ShiftBoot <- boot(data = rbind(mescChxLDists$utr5[refCodingIds,], -# mescEmetLDists$utr5[refCodingIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# lincShiftBoot <- boot(data = rbind(mescChxLDists$trx[cleanLincIds,], -# mescEmetLDists$trx[cleanLincIds,]), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# yeastShiftBoot <- boot(data = rbind(yeastChxLDists, yeastEmetLDists), -# statistic = ldistShiftStat, -# R = 10000, -# parallel = "multicore", ncpus=32) -# -# capture.output(summary(cdsShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt") -# capture.output(cdsShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(cdsShiftBoot$t[,20]) >= abs(cdsShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(cdsShiftBoot$t[,20]) < abs(cdsShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(utr5ShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(utr5ShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(utr5ShiftBoot$t[,20]) >= abs(utr5ShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(utr5ShiftBoot$t[,20]) < abs(utr5ShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(lincShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(lincShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(lincShiftBoot$t[,20]) >= abs(lincShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(lincShiftBoot$t[,20]) < abs(lincShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# capture.output(summary(yeastShiftBoot$t[,20]), file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(yeastShiftBoot$t0[20], file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(yeastShiftBoot$t[,20]) >= abs(yeastShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# capture.output(sum(abs(yeastShiftBoot$t[,20]) < abs(yeastShiftBoot$t0[20])), -# file="data/mesc-bootstrap-shift.txt", append=TRUE) -# -# pdf("ldist-shift-bootstrap-plot.pdf") -# -# bootplot <- function(b, t) { -# hist(b$t[,20], xlim=c(-1,1)) -# abline(v = b$t0[20]) -# title(main = t) -# -# boxplot(b$t[,20], ylim=c(-1,1), -# pars=list(outpch=21, outcex=0.5, -# boxwex=0.2, boxlty = "blank", boxfill=solarGrey01, medlwd = 0.33, medcol = "white" -# )) -# points(c(1), c(b$t0[20]), pch=21, col=solarRed, bg=lighten(solarRed)) -# axis(side=2, at=seq(-1,1,0.1), labels = FALSE) -# title(main = t) -# } -# -# plot(cdsShiftBoot, index=20) -# plot(utr5ShiftBoot, index=20) -# plot(lincShiftBoot, index=20) -# plot(yeastShiftBoot, index=20) -# -# bootplot(cdsShiftBoot, "Coding CDS") -# bootplot(utr5ShiftBoot, "Coding 5 UTR") -# bootplot(lincShiftBoot, "linc") -# bootplot(yeastShiftBoot, "Yeast") -# -# boxplot(cdsShiftBoot$t[,20], utr5ShiftBoot$t[,20], -# lincShiftBoot$t[,20], yeastShiftBoot$t[,20], -# ylim=c(-1,1), -# names=c("CDS", "Utr5", "linc", "yeast"), -# pars=list(outpch=21, outcex=0.5, -# boxwex=0.4, boxlty = "blank", boxfill=solarGrey01, medlwd = 0.33, medcol = "white" -# )) -# points(seq(1,4), -# c(cdsShiftBoot$t0[20], utr5ShiftBoot$t0[20], -# lincShiftBoot$t0[20], yeastShiftBoot$t0[20]), -# pch=21, col=solarRed, bg=lighten(solarRed)) -# axis(side=2, at=seq(-1,1,0.1), labels = FALSE) -# -# dev.off() diff --git a/src/FLOSS_score/ldist-plots-template.R b/src/FLOSS_score/ldist-plots-template.R new file mode 100644 index 00000000..48b421d7 --- /dev/null +++ b/src/FLOSS_score/ldist-plots-template.R @@ -0,0 +1,60 @@ +library("biomaRt") +library("GenomicRanges") +library("boot") + +source("./src/FLOSS_score/transcript-utils.R") +source("./src/FLOSS_score/footprint-assignment.R") +source("./src/FLOSS_score/floss.R") +source("./src/FLOSS_score/plot-utils.R") + +source("./src/FLOSS_score/mouse-annotation.R") + +cdist <- function(ldists) { normLDist(colSums(ldists, na.rm=TRUE)) } + +lengthLevels <- seq(defaultLengthMin, defaultLengthMax) + +RefLDist <- normLDist(read.csv("results/chx-ref-ldist.txt", check.names=F, row.names=1)[,1]) +chxLDists <- readRegionLengthDists("results/chxLDists") +rownames(chxLDists$cds) <- gsub("\\.[[:digit:]]*","" ,rownames(chxLDists$cds) ) +rownames(chxLDists$utr5) <- gsub("\\.[[:digit:]]*","" ,rownames(chxLDists$utr5) ) +rownames(chxLDists$utr3) <- gsub("\\.[[:digit:]]*","" ,rownames(chxLDists$utr3) ) +rownames(chxLDists$trx) <- gsub("\\.[[:digit:]]*","" ,rownames(chxLDists$trx) ) + +LDistCdses <- chxLDists$cds[rownames(chxLDists$cds) %in% refCodingIds,] +LDistUtr5s <- chxLDists$utr5[row.names(chxLDists$utr5) %in% refCodingIds,] +LDistLincs <- chxLDists$trx[row.names(chxLDists$trx) %in% cleanLincIds,] +LDistCdses <- chxLDists$cds[row.names(chxLDists$cds) %in% mitoCodingIds,] + +CdsDist <- cdist(LDistCdses) + +pdf("./results/examples-rplot.pdf") +plot(lengthLevels, CdsDist, type="l", col=lighten(solarGrey01), lwd=4, + xlim=c(26,34), ylim=c(0,0.6), xlab="Fragment Length", main = "LincRNA") +lines(lengthLevels, cdist(LDistLincs), type="l", col=solarMagenta, lwd=2) + +plot(lengthLevels, CdsDist, type="l", col=lighten(solarGrey01), lwd=4, + xlim=c(26,34), ylim=c(0,0.5), xlab="Fragment Length", main = "5' UTR") +lines(lengthLevels, cdist(LDistUtr5s), type="l", col=solarRed, lwd=2) + +dev.off() + +# Write data tables +# write.csv(hg38_NY5RefCodingCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-cds.txt") +# write.csv(hg38_NY5Terc, "results/10_FLOSS_score/tables/ldist-hg38_NY5-terc-trx.txt") +# write.csv(hg38_NY5Vault, "results/10_FLOSS_score/tables/ldist-hg38_NY5-vault-trx.txt") +# write.csv(hg38_NY5MitoCdses, "results/10_FLOSS_score/tables/ldist-hg38_NY5-mito-cds.txt") +# write.csv(hg38_NY5Snhg5, "results/10_FLOSS_score/tables/ldist-hg38_NY5-snhg5-trx.txt") +# write.csv(Lincs, "results/10_FLOSS_score/tables/ldist-hg38_NY5-linc-trx.txt") +# write.csv(hg38_NY5RefCodingUtr5s, "results/10_FLOSS_score/tables/ldist-hg38_NY5-refcoding-utr5.txt") +# write.csv(hg38_NY5HIVCDS, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-cds.txt") +# write.csv(hg38_NY5HIVLTR, "results/10_FLOSS_score/tables/ldist-hg38_NY5-hiv-ltr.txt") +# +# write.csv(rbind(refCodingCdses = cdist(hg38_NY5RefCodingCdses), +# cleanLincs = cdist(hg38_NY5Lincs), +# refCodingUtr5s = cdist(hg38_NY5RefCodingUtr5s), +# terc = cdist(hg38_NY5Terc), +# vault = cdist(hg38_NY5Vault), +# snhg5 = cdist(hg38_NY5Snhg5), +# hivCds = cdist(hg38_NY5HIVCDS), +# hivLtr = cdist(hg38_NY5HIVLTR)), +# "results/FLOSS_score/tables/ldist-hg38_NY5.txt") diff --git a/src/FLOSS_score/mouse-annotation.R b/src/FLOSS_score/mouse-annotation.R index 26d22312..0d75fdd2 100644 --- a/src/FLOSS_score/mouse-annotation.R +++ b/src/FLOSS_score/mouse-annotation.R @@ -4,7 +4,7 @@ library("biomaRt") ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset="mmusculus_gene_ensembl", host="uswest.ensembl.org") annots <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype", - "chromosome_name", "start_position", "end_position", "strand", "description", "external_gene_id", "external_gene_db"), + "chromosome_name", "start_position", "end_position", "strand", "description", "external_gene_name"), mart=ensembl) granges <- GRanges(seqnames = annots$chromosome_name, diff --git a/src/FLOSS_score/orf-org-plot.R b/src/FLOSS_score/orf-org-plot.R deleted file mode 100644 index 5d1e53e8..00000000 --- a/src/FLOSS_score/orf-org-plot.R +++ /dev/null @@ -1,117 +0,0 @@ -source("../shared/plot-utils.R") - -allLincTable <- read.csv("data/mesc-lincs-aug-analysis.txt") -lincTable <- allLincTable[!is.na(allLincTable$bestAugHarr),] - -allPcodingTable <- read.csv("data/mesc-pcoding-aug-analysis.txt") -pcodingTable <- allPcodingTable[!is.na(allPcodingTable$bestAugHarr),] - -pdf("mesc-lincs-aug-analysis.pdf", width = 6, height = 3, useDingbats = FALSE) - -bestAugCodonRankHist <- hist(lincTable$bestAugCodonRank, breaks=c(1:12), plot=FALSE) -barplot(bestAugCodonRankHist$counts, ylim=c(0,80), - main = "Rank of harringonine peak at AUG start (versus all codons)", - ylab = "Number of transcripts", - names.arg = c(1:11), las=3 - ) - -bestAugNtHist <- hist(lincTable$bestAugNt, breaks=c(seq(0,2000,200), 1e6), plot=FALSE) -barplot(bestAugNtHist$counts, ylim=c(0,50), - main = "Absolute position of AUG start [nt]", - ylab = "Number of transcripts", - names.arg = c(lapply(seq(0,1800,200), function(x) { sprintf("%d-%d", x, x+200) }), ">2000"), las=3 - ) - -bestAugRelHist <- hist(lincTable$bestAugRel, breaks=seq(0,1,0.1), plot=FALSE) -barplot(bestAugRelHist$counts, ylim=c(0,30), - main = "Relative position of AUG start", - ylab = "Number of transcripts", - names.arg = c(sapply(seq(0,9), function(i) { c(sprintf("%0.d%%", i*10)) })), las=3 - ) - -bestAugFractHist <- hist(lincTable$bestAugFract, breaks=seq(0,1,0.05), plot=FALSE) -barplot(bestAugFractHist$counts, ylim=c(0,50), - main = "Relative index of AUG start", - xlab = "Relative AUG index", - ylab = NA, - names.arg = c(sapply(seq(0,9), function(i) { c("", sprintf("%0.0f%%", 5 + i*10)) })), las=3 - ) - -print(table(lincTable$bestAugOrder)) -print(sum(lincTable$bestAugOrder == 1)) -print(sum(lincTable$bestAugOrder >= 2 & lincTable$bestAugOrder <= 5)) -print(sum(lincTable$bestAugOrder >= 6 & lincTable$bestAugOrder <= 20)) -print(sum(lincTable$bestAugOrder >= 21 & lincTable$bestAugOrder <= 100)) - -upTotal <- sum( lincTable$upFrame0, lincTable$upFrame1, lincTable$upFrame2, na.rm=TRUE) -upFrame0 <- sum( lincTable$upFrame0, na.rm=TRUE) / upTotal -upFrame1 <- sum( lincTable$upFrame1, na.rm=TRUE) / upTotal -upFrame2 <- sum( lincTable$upFrame2, na.rm=TRUE) / upTotal - -orfTotal <- sum( lincTable$orfFrame0, lincTable$orfFrame1, lincTable$orfFrame2, na.rm=TRUE) -orfFrame0 <- sum( lincTable$orfFrame0, na.rm=TRUE) / orfTotal -orfFrame1 <- sum( lincTable$orfFrame1, na.rm=TRUE) / orfTotal -orfFrame2 <- sum( lincTable$orfFrame2, na.rm=TRUE) / orfTotal - -downTotal <- sum( lincTable$downFrame0, lincTable$downFrame1, lincTable$downFrame2, na.rm=TRUE) -downFrame0 <- sum( lincTable$downFrame0, na.rm=TRUE) / downTotal -downFrame1 <- sum( lincTable$downFrame1, na.rm=TRUE) / downTotal -downFrame2 <- sum( lincTable$downFrame2, na.rm=TRUE) / downTotal - -framing <- matrix( data = c( upFrame0, upFrame1, upFrame2, - orfFrame0, orfFrame1, orfFrame2, - downFrame0, downFrame1, downFrame2 ), - nrow = 3, ncol = 3 ) - -barplot(framing, beside=TRUE, - - main = "Footprint frame versus AUG start, lincRNAs", - ylab = "Fraction of reads", - ylim = c(0,0.75), - names.arg = c("", "Upstream", "", "", "ORF", "", "", "Downstream", "") - ) - -upTotal <- sum( pcodingTable$upFrame0, pcodingTable$upFrame1, pcodingTable$upFrame2, na.rm=TRUE) -upFrame0 <- sum( pcodingTable$upFrame0, na.rm=TRUE) / upTotal -upFrame1 <- sum( pcodingTable$upFrame1, na.rm=TRUE) / upTotal -upFrame2 <- sum( pcodingTable$upFrame2, na.rm=TRUE) / upTotal - -orfTotal <- sum( pcodingTable$orfFrame0, pcodingTable$orfFrame1, pcodingTable$orfFrame2, na.rm=TRUE) -orfFrame0 <- sum( pcodingTable$orfFrame0, na.rm=TRUE) / orfTotal -orfFrame1 <- sum( pcodingTable$orfFrame1, na.rm=TRUE) / orfTotal -orfFrame2 <- sum( pcodingTable$orfFrame2, na.rm=TRUE) / orfTotal - -downTotal <- sum( pcodingTable$downFrame0, pcodingTable$downFrame1, pcodingTable$downFrame2, na.rm=TRUE) -downFrame0 <- sum( pcodingTable$downFrame0, na.rm=TRUE) / downTotal -downFrame1 <- sum( pcodingTable$downFrame1, na.rm=TRUE) / downTotal -downFrame2 <- sum( pcodingTable$downFrame2, na.rm=TRUE) / downTotal - -framing <- matrix( data = c( upFrame0, upFrame1, upFrame2, - orfFrame0, orfFrame1, orfFrame2, - downFrame0, downFrame1, downFrame2 ), - nrow = 3, ncol = 3 ) - -barplot(framing, beside=TRUE, - - main = "Footprint frame versus AUG start, PCoding", - ylab = "Fraction of reads", - ylim = c(0,0.75), - names.arg = c("", "Upstream", "", "", "ORF", "", "", "Downstream", "") - ) - -dev.off() - -pdf("mesc-linc-orf-analysis.pdf", width = 6, height = 6, useDingbats = FALSE) - -plot(log10(lincTable$expInOrf), log10(lincTable$obsInOrf), - xlab = "Expected CHX footprints in ORF", - ylab = "Observed CHX footprints in ORF", - xlim = c(0,3), ylim = c(0,3), - axes = F) -abline(a=0,b=1) -axis(1, log10(10**seq(0,3)), labels=c("1", "10", "100", "1k"), cex.axis=1) -axis(1, log10(c(sapply(10**seq(0,2), minor_decade))), labels=F, tcl=-0.5) -axis(2, log10(10**seq(0,3)), labels=c("1", "10", "100", "1k"), cex.axis=1) -axis(2, log10(c(sapply(10**seq(0,2), minor_decade))), labels=F, tcl=-0.5) - -dev.off() diff --git a/src/FLOSS_score/orf-organization.R b/src/FLOSS_score/orf-organization.R deleted file mode 100644 index b9efc5b3..00000000 --- a/src/FLOSS_score/orf-organization.R +++ /dev/null @@ -1,205 +0,0 @@ -library("Rsamtools") -library("Biostrings") -library("rtracklayer") - -source("../shared/transcript-utils.R") -source("../shared/footprint-assignment.R") -source("../shared/floss.R") -source("mouse-annotation.R") - -emetFloss <- read.csv("data/mesc-emet-floss.txt", check.names=FALSE, row.names=1) - -xref <- match(row.names(emetFloss), annots$ensembl_transcript_id) -table(is.na(xref)) -trxs <- cbind(emetFloss, annots[xref,]) -trxs$name <- row.names(trxs) - -pcoding = trxs[trxs$name %in% codingIds & !(trxs$name %in% mitoIds) & !(trxs$name %in% dirtyPCodingIds),] -lincs = trxs[trxs$name %in% lincIds & !(trxs$name %in% dirtyLincIds),] - -emetCutoffs <- flossCutoffs(pcoding$cdsNRead, pcoding$cdsScore) - -pcoding$classifyTrx <- emetCutoffs$classify(pcoding$trxNRead, pcoding$trxScore) -lincs$classifyTrx <- emetCutoffs$classify(lincs$trxNRead, lincs$trxScore) - -options(mc.cores=36) - -bedFile <- "/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/Mm.GRCm38.72.bed" -##lincBedFile <- "data/orflincs.bed" -mescFaFile <- "/mnt/ingolialab/ingolia/Genomes/SpikeIn/mm10-scspike.fa" - -chxBamFile <- BamFile("/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_chx_mesc_genome_sorted.bam") -harr120BamFile <- BamFile("/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_harr120_oldmesc_genome_sorted.bam") -harr150BamFile <- BamFile("/mnt/ingolialab/ingolia/RiboIP/Alignment-mesc-130709/ribo_harr150_oldmesc_genome_sorted.bam") - -allBed <- import(bedFile, format="BED", asRangedData = FALSE) - -pcodingBed <- allBed[match(pcoding$name, allBed$name)] -lincsBed <- allBed[match(lincs$name, allBed$name)] - -# A sites from Cell 2011 data sets (harringtonine) -harrASites <- data.frame(asite=c(14,14,15,16,16,16,17,17), row.names=seq(28,35)) -chxASites <- data.frame(asite=c(16,16,17,17,17), row.names=seq(28,32)) - -getTrxSeq <- function(faidxfile, trx) { - fa <- open(FaFile(faidxfile)) - ss <- getSeq(fa, param=GRanges(seqnames=seqnames(trx), ranges=ranges(trx))) - if (as.factor(strand(trx)) == "-") { - toString(reverseComplement(unlist(ss))) - } else { - toString(unlist(ss)) - } -} - -augIndices <- function(sequ) { - gregexpr("ATG", sequ)[[1]] -} - -harrScore <- function(harrDensity, augidx) { - startOcc <- sum(harrDensity[augidx + (2:4)]) - after <- sum(harrDensity[augidx + (5:7)]) - before <- sum(harrDensity[(max(0, augidx - 4)):(augidx + 1)]) - ifelse( startOcc > max(0, before, after), startOcc, NA ) -} - -augBestScoreIndex <- function(harrDensity120, harrDensity150, sequ) { - augs <- augIndices(sequ) - augScores <- lapply(augs, function(aug) { harrScore(harrDensity120, aug) + harrScore(harrDensity150, aug) }) - bestAug <- which.max(augScores) - ifelse( length(bestAug) == 1, augs[bestAug], NA ) -} - -inFrameStopIndex <- function(sequ, augidx) { - isstop <- Vectorize(function(idx) { substr(sequ, idx, idx+2) %in% c("TGA", "TAG", "TAA") }) - frame <- seq(augidx, nchar(sequ) - 2, 3) - stops <- which(isstop(frame)) - if (length(stops) > 0) frame[min(stops)] else NA -} - -ntToRelativeCodonProfile <- function(ntprof, startidx) { - lb <- - (startidx %/% 3) - ub <- (1 + length(ntprof) - startidx) %/% 3 - relprof <- data.frame( relpos = lb:ub ) - relprof$counts <- lapply(relprof$relpos, function(rc) { sum(ntprof[max(0, startidx + 3*rc - 1):(max(0, startidx + 3*rc + 1))]) }) - relprof -} - -analyzeTrx <- function( trxGR ) { - sequ <- getTrxSeq(mescFaFile, trxGR) - - profChx <- getASiteProfile(chxASites, chxBamFile, trxGR) - prof120 <- getASiteProfile(harrASites, harr120BamFile, trxGR) - prof150 <- getASiteProfile(harrASites, harr150BamFile, trxGR) - - res <- list(trxLength = nchar(sequ), - trxChxTotal = sum(profChx) ) - - bestAug <- augBestScoreIndex( prof120, prof150, sequ ) - - if (!is.na(bestAug)) { - res$bestAugHarr = sum(prof120[(bestAug + 2):(bestAug + 4)], prof150[(bestAug + 2):(bestAug + 4)]) - - res$bestAugNt = bestAug - res$bestAugRel = (bestAug - 1) / (nchar(sequ) - 1) - - augIdxs <- augIndices(sequ) - res$bestAugOrder = match(bestAug, augIdxs) - res$bestAugFract = (res$bestAugOrder - 1) / (length(augIdxs) - 1) - - codonRelProf <- ntToRelativeCodonProfile(prof120 + prof150, bestAug) - augCounts <- codonRelProf$counts[match(1, codonRelProf$relpos)][[1]] - nabove <- sum(codonRelProf$counts > augCounts, na.rm=T) - res$bestAugCodonRank <- 1 + nabove - - stopNt <- inFrameStopIndex(sequ, bestAug) - res$stopNt <- stopNt - if (is.na(stopNt)) stopNt <- nchar(sequ) - res$obsInOrf <- sum(profChx[(bestAug + 2):(stopNt + 1)]) - res$expInOrf <- res$trxChxTotal * ( stopNt - bestAug ) / nchar(sequ) - - res$upFrame0 <- sum(profChx[seq(bestAug + 0, 1, -3)], na.rm=TRUE) - res$upFrame1 <- sum(profChx[seq(bestAug + 1, 1, -3)], na.rm=TRUE) - res$upFrame2 <- sum(profChx[seq(bestAug - 1, 1, -3)], na.rm=TRUE) - - res$orfFrame0 <- sum(profChx[seq(bestAug + 3, stopNt + 0, 3)], na.rm=TRUE) - res$orfFrame1 <- sum(profChx[seq(bestAug + 4, stopNt + 1, 3)], na.rm=TRUE) - res$orfFrame2 <- sum(profChx[seq(bestAug + 2, stopNt - 1, 3)], na.rm=TRUE) - - res$downFrame0 <- sum(profChx[seq(min(stopNt + 3, nchar(sequ)), nchar(sequ), 3)], na.rm=TRUE) - res$downFrame1 <- sum(profChx[seq(min(stopNt + 4, nchar(sequ)), nchar(sequ), 3)], na.rm=TRUE) - res$downFrame2 <- sum(profChx[seq(min(stopNt + 2, nchar(sequ)), nchar(sequ), 3)], na.rm=TRUE) - } else { - res$bestAugHarr <- NA - - res$bestAugNt <- NA - res$bestAugRel <- NA - - res$bestAugOrder <- NA - res$bestAugFract <- NA - - res$bestAugCodonRank <- NA - - res$stopNt <- NA - res$obsInOrf <- NA - res$expInOrf <- NA - - res$upFrame0 <- NA - res$upFrame1 <- NA - res$upFrame2 <- NA - - res$orfFrame0 <- NA - res$orfFrame1 <- NA - res$orfFrame2 <- NA - - res$downFrame0 <- NA - res$downFrame1 <- NA - res$downFrame2 <- NA - } - - res -} - -bedAnalysis <- function(bed) { - rows <- mclapply( 1:length(bed), function(idx) { - trx <- transcriptGRanges(bed[idx]) - names(trx) <- bed$name[[idx]] - cat(sprintf("%6d\t%s\n", idx, names(trx)[[1]])) - res <- analyzeTrx(trx) - res$trx <- names(trx)[[1]] - res - }) - - analysis <- as.data.frame(do.call(rbind, rows)) - - data.frame( trx = as.vector( analysis$trx, mode="character" ), - trxLength = as.vector( analysis$trxLength, mode="integer" ), - trxChxTotal = as.vector( analysis$trxChxTotal, mode="integer" ), - bestAugHarr = as.vector( analysis$bestAugHarr, mode="integer" ), - bestAugNt = as.vector( analysis$bestAugNt, mode="integer" ), - bestAugRel = as.vector( analysis$bestAugRel, mode="numeric" ), - bestAugOrder = as.vector( analysis$bestAugOrder, mode="integer" ), - bestAugFract = as.vector( analysis$bestAugFract, mode="numeric" ), - bestAugCodonRank = as.vector( analysis$bestAugCodonRank, mode="integer" ), - stopNt = as.vector( analysis$stopNt, mode="integer" ), - obsInOrf = as.vector( analysis$obsInOrf, mode="integer" ), - expInOrf = as.vector( analysis$expInOrf, mode="integer" ), - upFrame0 = as.vector( analysis$upFrame0, mode="integer" ), - upFrame1 = as.vector( analysis$upFrame1, mode="integer" ), - upFrame2 = as.vector( analysis$upFrame2, mode="integer" ), - orfFrame0 = as.vector( analysis$orfFrame0, mode="integer" ), - orfFrame1 = as.vector( analysis$orfFrame1, mode="integer" ), - orfFrame2 = as.vector( analysis$orfFrame2, mode="integer" ), - downFrame0 = as.vector( analysis$downFrame0, mode="integer" ), - downFrame1 = as.vector( analysis$downFrame1, mode="integer" ), - downFrame2 = as.vector( analysis$downFrame2, mode="integer" ) - ) -} - -pcodingTable <- bedAnalysis(pcodingBed) -pcodingTable <- cbind.data.frame(pcoding, pcodingTable) -write.csv(pcodingTable, "data/mesc-pcoding-aug-analysis.txt") - -lincsTable <- bedAnalysis(lincsBed) -lincsTable <- cbind.data.frame(lincs, lincsTable) -write.csv(lincsTable, "data/mesc-lincs-aug-analysis.txt") - diff --git a/src/FLOSS_score/scspike.sh b/src/FLOSS_score/scspike.sh deleted file mode 100644 index b4169bf6..00000000 --- a/src/FLOSS_score/scspike.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e - -# http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_R64-1-1_20110203.tgz -# With ugly Fasta headers converted to chr01 etc. - -sed 's/^>chr/>saccer/' ../Saccharomyces_cerevisiae/YeastGenome/saccharomyces_cerevisiae.fa \ - > sac_cer_spike.fa - -sed 's/^chr/saccer/' ../Saccharomyces_cerevisiae/YeastGenome/saccharomyces_cerevisiae.gtf \ - > sac_cer_spike.gtf - -cat ../Mus_musculus/mm9.fa sac_cer_spike.fa > mm9-scspike.fa -cat ../Mus_musculus/mm9_knownGene.gtf sac_cer_spike.gtf > mm9-scspike_knownGene.gtf - -cat /mnt/sequence/genomes/mouse/mm10.fa sac_cer_spike.fa > mm10-scspike.fa - -cat ../Homo_sapiens/hg19.fa sac_cer_spike.fa > hg19-scspike.fa -cat ../Homo_sapiens/hg19_knownGene.gtf sac_cer_spike.gtf > hg19-scspike_knownGene.gtf - -bowtie-build mm9-scspike.fa mm9-scspike -bowtie-build hg19-scspike.fa hg19-scspike - -cat ../Saccharomyces_cerevisiae/sc-rrna.fa ../Mus_musculus/mm_rrna.fa > mm-scspike-rrna.fa -cat ../Saccharomyces_cerevisiae/sc-rrna.fa ../Homo_sapiens/refseq_rrna.fa > hs-scspike-rrna.fa - -bowtie-build mm-scspike-rrna.fa mm-scspike-rrna -bowtie-build hs-scspike-rrna.fa hs-scspike-rrna -bowtie-build mm10-scspike.fa mm10-scspike -bowtie2-build mm10-scspike.fa mm10-scspike -- GitLab