From cc6497621281f177e2e9f1a09b521b0bfeb4fe8c Mon Sep 17 00:00:00 2001
From: jplantad <julie.plantade@ens-lyon.fr>
Date: Tue, 25 May 2021 15:07:01 +0200
Subject: [PATCH] compress, index and merge

---
 compress_vcf_to_gz.sh | 36 ++++++++++++++++++++++++++++++++++++
 generate_index.sh     | 17 +++++++++++++++++
 merge_fastafiles.sh   | 14 ++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100755 compress_vcf_to_gz.sh
 create mode 100755 generate_index.sh
 create mode 100755 merge_fastafiles.sh

diff --git a/compress_vcf_to_gz.sh b/compress_vcf_to_gz.sh
new file mode 100755
index 0000000..1462df2
--- /dev/null
+++ b/compress_vcf_to_gz.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Remove the GL FORMAT header line from every file of data_fixed_999 and write
+# a compressed copy (<name>.gz) into data_999_gz. Stops at the first failure.
+set -euo pipefail
+
+home="/home/stagiaire/Bureau/phylogenetics/"
+data_999_folder="/home/stagiaire/Bureau/gitlab/data_fixed_999/"
+cd "${home}"
+# check the list of vcf that need to be compressed
+ls "${data_999_folder}"
+
+# create a new folder that will contain the gz files, and get into it
+mkdir -p data_999_gz
+cd data_999_gz/
+pwd
+
+# remove the temporary VCF file even when a step fails and aborts the run
+tmp=""
+trap 'rm -f -- "${tmp}"' EXIT
+# glob rather than parse ls; -f skips directories and an unmatched pattern
+for src in "${data_999_folder}"*; do
+	[[ -f "${src}" ]] || continue
+	vcf="${src##*/}"
+	tmp="${vcf}_tmp.vcf"
+	printf '#####\nProcessing %s\n' "${vcf}"
+	# suppress the GL line in the VCF file header, saved to a temporary VCF
+	sed '/^##FORMAT=<ID=GL/d' "${src}" > "${tmp}"
+	echo "temporary VCF file created"
+	bcftools view "${tmp}" -Oz -o "${vcf}.gz"
+	echo "compressed file computed"
+	rm -- "${tmp}"
+	echo "temporary VCF file deleted"
+	printf '%s processed.\n#####\n' "${vcf}"
+done
+# check that the compressed files are in the folder
+ls -l
diff --git a/generate_index.sh b/generate_index.sh
new file mode 100755
index 0000000..6284763
--- /dev/null
+++ b/generate_index.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# generate a .csi index for every file of the data_999_gz folder
+set -euo pipefail
+
+# set home directory
+home="/home/stagiaire/Bureau/phylogenetics/data_999_gz/"
+cd "${home}"
+
+for vcf in *
+do
+	# skip directories and the indexes written by an earlier run: listing
+	# them again would make bcftools try to index its own .csi output
+	[[ -f "${vcf}" && "${vcf}" != *.csi ]] || continue
+	bcftools index -f "${vcf}" -o "${vcf}.csi"
+	echo "##### ${vcf} done."
+done
diff --git a/merge_fastafiles.sh b/merge_fastafiles.sh
new file mode 100755
index 0000000..993eb1a
--- /dev/null
+++ b/merge_fastafiles.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# merging fasta files of all individuals for each gene
+set -euo pipefail
+shopt -s nullglob
+
+# set home directory
+home="/home/stagiaire/Bureau/phylogenetics/data_sequences/pon/"
+cd "${home}"
+
+indiv_list=( *renamed_AB.fa )
+(( ${#indiv_list[@]} )) || { echo "no *renamed_AB.fa file in ${home}" >&2; exit 1; }
+# overwrite instead of append so that a re-run does not duplicate every sequence
+cat -- "${indiv_list[@]}" > "BST2_ponAbe2_all.fa"
-- 
GitLab