diff --git a/src/non-b/README_basedo.Rmd b/src/non-b/README_basedo.Rmd
deleted file mode 100644
index 00cb78459f09f2d7d3907118b12de304ce48bf65..0000000000000000000000000000000000000000
--- a/src/non-b/README_basedo.Rmd
+++ /dev/null
@@ -1,201 +0,0 @@
----
-title: "readme_basedo.Rmd"
-author: "Sirvent Matthaus"
-date: "2024-02-21"
-output: html_document
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-## Project Overview
-
-This project aims to analyze non-B DNA structures and repeats across the whole hg38 genome. We gathered data made available by the scientific community and formatted it into .bed files. These files are used in our analysis with bedtools intersect to find the presence of non-B structures across our annotation and to plot their distribution over multiple gene communities.
-
-### Database Files Description
-
-The database contains the following files:
-
-```{r, echo=FALSE}
-# Create the data frame
-dataset <- data.frame(
-  Source = c("gquadR", "gquadR", "gquadR", "gquadR", "gquadR", "gquadR",
-             "non-b_db", "non-b_db", "non-b_db", "non-b_db", "non-b_db", "non-b_db", "non-b_db",
-             "G4_Grinder", "G4_Grinder", "G4_Grinder", "G4_Grinder",
-             "repbase", "repbase",
-             "IM_seeker", "IM_seeker", "IM_seeker", "IM_seeker", "IM_seeker", "IM_seeker", "IM_seeker", "IM_seeker",
-             "dfam", "allquads"),
-  Structure_Content = c("A_phase_repeat", "hdna", "slipped", "short_tandem_repeat", "triplex", "zdna",
-                        "A_phase_repeat", "direct_repeat", "g_quadruplex", "inverted_repeat", "mirror_repeat", "short_tandem_repeat", "zdna",
-                        "i_motif", "i_motif", "g_quadruplex", "g_quadruplex",
-                        "repeats", "repeats",
-                        "i-motif", "i-motif", "i-motif", "i-motif", "i-motif", "i-motif", "i-motif", "i-motif",
-                        "repeats", "g_quadruplex"),
-  File = c("gquadR_APR.bed", "gquadR_HDNA.bed", "gquadR_slipped.bed", "gquadR_STR.bed", "gquadR_TFO.bed", "gquadR_ZDNA.bed",
-           "non-b_db_APR.bed", "non-b_db_DR.bed", "non-b_db_GQ.bed", "non-b_db_IR.bed", "non-b_db_MR.bed", "non-b_db_STR.bed", "non-b_db_Z.bed",
-           "PiMS_PQSM2a_all_chromosomes.csv", "PiMS_PQSM3a_all_chromosomes.csv", "PQS_PQSM2a_all_chromosomes.csv", "PQS_PQSM3a_all_chromosomes.csv",
-           "repbase_allchr.bed", "repbase_repeatmasker.bed",
-           "unfirmed_iM_Greedy_nonoverlapped_conformationA.bed", "unfirmed_iM_Greedy_nonoverlapped_conformationB.bed",
-           "unfirmed_iM_nonGreedy_nonoverlapped_conformationA.bed", "unfirmed_iM_nonGreedy_nonoverlapped_conformationB.bed",
-           "firmed_iM_nonGreedy_nonoverlapped_conformationB.bed", "firmed_iM_nonGreedy_nonoverlapped_conformationA.bed",
-           "firmed_iM_Greedy_nonoverlapped_conformationB.bed", "firmed_iM_Greedy_nonoverlapped_conformationA.bed",
-           "dfam_repeat.bed", "alquads_hg19.bed")
-)
-
-# Print the table
-knitr::kable(dataset, caption = "Overview of the dataset files")
-```
-
-Each file is in .bed format, where the 4th column is a name or an ID giving the provenance of the structure:
-
-```{r, echo=FALSE}
-# Create the data frame
-specific_dataset <- data.frame(
-  Chromosome = c("chr1", "chr1", "chr1", "chr1", "chr10", "chr10", "chr10", "chr10"),
-  Start = c(10008, 10050, 10092, 10133, 10001, 10314, 10733, 18858),
-  End = c(10048, 10090, 10131, 10174, 10374, 10429, 10794, 18887),
-  Type = c("firmed_iM_Greedy_nonoverlapped_conformationA", "firmed_iM_Greedy_nonoverlapped_conformationA",
-           "firmed_iM_Greedy_nonoverlapped_conformationA", "firmed_iM_Greedy_nonoverlapped_conformationA",
-           "Direct_Repeat", "Direct_Repeat", "Direct_Repeat", "Direct_Repeat")
-)
-
-# Print the table with caption
-knitr::kable(specific_dataset,
-             caption = "Overview of the structure of the .bed files")
-```
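Any of these files can be sanity-checked by loading it as a plain 4-column table; a minimal sketch, assuming pandas and a local copy of one of the database files (the path is illustrative):

```python
import pandas as pd

# Illustrative path; point this at any file from the table above
bed = pd.read_csv("non-b_db_GQ.bed", sep="\t", header=None,
                  names=["chrom", "start", "end", "name"])
print(bed.head())
```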
-
-### Software and Libraries Required
-
-To conduct the analysis, the following software and libraries are required:
-
-```
-Bash
-Libraries: bedtools
-
-Python
-Libraries: pandas, collections (defaultdict), matplotlib, seaborn
-```
-
-## Data Loading and Preprocessing
-
-The whole database has already been formatted and is ready to use. Preprocessing differs for each data source, but it usually consists of awk manipulations, or a Python script for files with a more complex structure.
-
-```{bash, eval=FALSE}
-structures=("APR" "DR" "GQ" "IR" "MR" "STR" "Z")
-# concatenate the per-chromosome files of each structure into one file
-for i in ${structures[@]}; do cat *${i}.tsv > ${i}_all.tsv; done
-# strip the header lines
-for i in *_all.tsv ; do grep -v 'Sequence' $i > ${i}_clean;done
-# convert to BED format
-for i in *_clean ;do awk 'BEGIN{OFS="\t"} {print $1,$4,$5,$3}' $i > ${i}.bed ;done
-```
-
-```{python, eval=FALSE}
-def create_bed_from_file(input_file_path, output_file_path):
-    import os
-
-    # Extract the base name of the file without the extension to use as the 'name' field
-    base_name = os.path.splitext(os.path.basename(input_file_path))[0]
-
-    # Open the input file for reading
-    with open(input_file_path, 'r') as file:
-        # Skip the header line; remove this if the input file has no header
-        next(file)
-
-        # Open the output file for writing
-        with open(output_file_path, 'w') as output:
-            for line in file:
-                # Split each line into components based on the delimiter (assuming '|')
-                parts = line.strip().split('|')
-
-                # Extract the chromosomal information
-                chr_name = parts[0]  # Chromosome name
-                start = str(int(parts[1]) - 1)  # BED format uses 0-based start positions
-                stop = parts[2]  # Stop position
-
-                # Construct the BED line (chr, start, stop, name)
-                bed_line = '\t'.join([chr_name, start, stop, base_name]) + '\n'
-                output.write(bed_line)
-
-# Example usage (paths need to be updated to actual file paths for real use)
-input_file_path = '/home/matthaus/Documents/data/basedo/IMSEEKER/classification_dataset_iMSeeker/unfirmed_iM_nonGreedy_nonoverlapped_conformationB.txt'
-output_file_path = '/home/matthaus/Documents/data/basedo/IMSEEKER/classification_dataset_iMSeeker/unfirmed_iM_nonGreedy_nonoverlapped_conformationB.bed'
-```
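For batch conversion, the same function can be mapped over a whole directory. A sketch, assuming `create_bed_from_file` is defined as above and the iM-Seeker .txt outputs sit in a single folder (the pattern is illustrative):

```python
import glob
import os

# Illustrative location of the iM-Seeker outputs
for txt in glob.glob("classification_dataset_iMSeeker/*.txt"):
    create_bed_from_file(txt, os.path.splitext(txt)[0] + ".bed")
```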
-
-## Analysis
-
-### 1st script: Intersect
-
-After the data gathering and database creation, our analysis takes .bed files with the locations of non-B structures on hg38 and runs a bedtools intersect against our annotation.
-
-That's the job of "chia-pet_network/src/non-b/str_intersect_dblbed.sh".
-
-It takes 2 arguments: annotation.bed structure.bed
-
-```{bash, eval=FALSE}
-bash str_intersect_dblbed.sh chia-pet_network/data/non-b/annot/genes.bed chia-pet_network/data/non-b/basedo/PiMS_PQSM2a_all_chromosomes.csv
-```
-
-The resulting file is a double-.bed intersect that will be used by the next scripts:
-
-```{r, echo=FALSE}
-# Creating the data frame
-overlap_data <- data.frame(
-  Chromosome = rep("chr1", 5),
-  Start = rep(913922, 5),
-  End = rep(954575, 5),
-  Gene_ID = rep("ENSG00000187634", 5),
-  Gene_Name = rep("SAMD11", 5),
-  Strand = rep("+", 5),
-  Overlap_Chromosome = rep("chr1", 5),
-  Overlap_Start = c(913975, 914112, 914158, 914310, 914322),
-  Overlap_End = c(914006, 914152, 914190, 914359, 914361),
-  Feature = rep("eHG4", 5)
-)
-
-# Printing the table with kable
-knitr::kable(overlap_data, caption = "Intersect of annotation and structure bed")
-```
-
-### 2nd script: pandas database
-
-After acquiring the intersect files, the second script builds pandas dataframes in order to plot the content of non-B structures across communities.
-
-chia-pet_network/src/non-b/str_stats.py is composed of 4 functions:
-
-df_maker(filepath) takes an intersect file and parses it to return final_df, a merged dataframe with the number and ratio of non-B structures on each gene.
-
-read_community_genes(filename) takes a community file and creates a defaultdict from this information.
-
-comm_filter(str_df, comu_dict) takes the final_df created by the first function and the community dict created by the second one, and labels each gene with its community, keeping only the count values below the 90th percentile to avoid aberrant ones. It returns a df used in the next script.
-
-comm_filter_norm(str_df, comu_dict) does the same job as the previous one, but with data normalised by structure length.
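The outlier trimming in comm_filter is a simple percentile cut; in isolation (toy values for illustration):

```python
import pandas as pd

# Toy counts; the real column comes from df_maker(intersect_file)
df = pd.DataFrame({"gene_id": [f"g{i}" for i in range(10)],
                   "count": [1, 2, 2, 3, 3, 4, 4, 5, 6, 50]})
threshold = df["count"].quantile(0.9)   # 90th percentile of the counts
trimmed = df[df["count"] <= threshold]  # drops the aberrant gene with count 50
```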
-
-### 3rd script: Plot generator
-
-After generating the dataframes of non-B structure counts on each community, it's time to plot them!
-This script can call the previous functions, so you only need to generate the intersect files and then call the plotting function as:
-
-```
-plot_all(intersect_file, community_file)
-```
-
-chia-pet_network/src/non-b/plot_generator.py uses the previous functions to generate 4 plots: density curves and violin plots of the counts, plus their length-normalised versions.
-
-## Conclusion
-
-## Contact Information
-
-For further information or queries, please contact:
-
-- Name: Sirvent Matthaus
-
-- Email: Matthaus.sirvent\@lyon-ens.fr
-
-- GitHub: msirvent
diff --git a/src/non-b/chromoMap.R b/src/non-b/chromoMap.R
deleted file mode 100644
index e3ddae6bff183d70e402c60dcd6414e35d564acf..0000000000000000000000000000000000000000
--- a/src/non-b/chromoMap.R
+++ /dev/null
@@ -1,22 +0,0 @@
-library("gquad")
-
-fasta_files <- list.files("/home/matthaus/Documents/data/annot/chroms/", pattern = "chr.*\\.fasta$", full.names = TRUE)
-output_directory <- "/home/matthaus/Documents/data/annot/chroms/testgquad/"
-
-for (file in fasta_files) {
-  # gquad(file, xformat = "fasta") returns a data frame of predicted structures
-  data <- gquad(file, xformat = "fasta")
-
-  # Extract the filename without extension
-  filename <- tools::file_path_sans_ext(basename(file))
-
-  # Create the TSV filename within the output directory
-  tsv_filename <- paste0(output_directory, filename, ".tsv")
-
-  # Write the data frame to a TSV file
-  write.table(data, tsv_filename, sep = "\t", quote = FALSE, row.names = FALSE)
-}
-
-# Shell post-processing of the per-chromosome TSVs:
-#for i in *; do grep -v "n" ${i} > $(basename ${i})_won.tsv ; done
-#for i in *won*; do awk -v i="$i" ' BEGIN{OFS="\t"; split(i, parts, ".")} {a=$1+$3} {print parts[1], $1,a,$2}' $i;done
-#cat *.bed >allchr.bed
diff --git a/src/non-b/density_plots.py b/src/non-b/density_plots.py
new file mode 100644
index 0000000000000000000000000000000000000000..431bef3ce1f579b789911a84905c72c1481049a4
--- /dev/null
+++ b/src/non-b/density_plots.py
@@ -0,0 +1,692 @@
+from collections import defaultdict
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import os
+from matplotlib.backends.backend_pdf import PdfPages
+import plotly.graph_objects as go
+
+intersect = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+files = [file for file in os.listdir(intersect) if os.path.isfile(os.path.join(intersect, file))]
+comu = "/home/matthaus/Documents/data/community_SPIN.txt"
+
+
+def read_community_genes(filename):
+    """Read a community genes file and return a dict mapping each community to its gene IDs.
+
+    Args:
+        filename (str): path to a community file, e.g. "/home/matthaus/Documents/data/community_SPIN.txt"
+
+    Returns:
+        defaultdict(list): dgene[community] = ["ENS0001", "ENS0013", "ENS0876", ...]
+    """
+    dgene = defaultdict(list)
+
+    with open(filename, 'r') as file:
+        nbli = 0
+        for line in file:
+            if nbli >= 1:  # skip the header line
+                linesplit = line.split("\t")
+                gene_ids = linesplit[-2].split(",")
+                for gene_id in gene_ids:
+                    dgene[linesplit[0]].append(gene_id.strip())
+            nbli += 1
+
+    return dgene
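This parser takes the community label from the first column and the comma-separated gene list from the second-to-last column of a tab-separated file with one header line. An illustrative layout and call (column names and values are invented for the example):

```python
# community_SPIN.txt (tab-separated, one header line; "genes" is the
# second-to-last column):
#
#   community   nodes   genes                          score
#   Speckle     3       ENSG0001,ENSG0002,ENSG0003     0.9
#
dgene = read_community_genes("/home/matthaus/Documents/data/community_SPIN.txt")
print(dgene["Speckle"][:3])
```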
+
+
+def df_maker_percent(file_path):
+    """Per-gene nucleotide composition: grouped by the mean of the percentages."""
+    # Read the file into a DataFrame
+    df = pd.read_csv(file_path, sep='\t', header=None)
+
+    if df.shape[1] == 11:
+        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
+                      'chr_end', 'structure_start', 'structure_end', 'struct', 'sequ']
+
+    # Convert 'sequ' to lowercase
+    df['sequ_lower'] = df['sequ'].str.lower()
+
+    # Simple nucleotides and their composite IUPAC combinations
+    nucleotides = ['A', 'C', 'G', 'T', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V']
+
+    # Count the simple nucleotides; the composite codes are derived from them below
+    for nucleotide in ['A', 'C', 'G', 'T']:
+        df[f'{nucleotide}_count'] = df['sequ_lower'].str.count(nucleotide.lower())
+
+    # Calculate composite nucleotide counts based on the simple nucleotide counts
+    df['R_count'] = df['A_count'] + df['G_count']
+    df['Y_count'] = df['C_count'] + df['T_count']
+    df['S_count'] = df['C_count'] + df['G_count']
+    df['W_count'] = df['A_count'] + df['T_count']
+    df['K_count'] = df['T_count'] + df['G_count']
+    df['M_count'] = df['A_count'] + df['C_count']
+    df['B_count'] = df['C_count'] + df['G_count'] + df['T_count']
+    df['D_count'] = df['A_count'] + df['G_count'] + df['T_count']
+    df['H_count'] = df['A_count'] + df['C_count'] + df['T_count']
+    df['V_count'] = df['A_count'] + df['G_count'] + df['C_count']
+
+    # Calculate the total number of nucleotides
+    df['total_nucleotides'] = df['A_count'] + df['C_count'] + df['G_count'] + df['T_count']
+
+    # Calculate the percentage of each nucleotide
+    for nucleotide in nucleotides:
+        df[f'{nucleotide}_percent'] = (df[f'{nucleotide}_count'] / df['total_nucleotides']) * 100
+
+    # Aggregation dictionary: count occurrences via a column that is guaranteed to exist ('start')
+    aggregated_functions = {'start': 'size'}
+    # Include mean percentage calculations for all nucleotides
+    aggregated_functions.update({f'{n}_percent': 'mean' for n in nucleotides})
+    #aggregated_functions.update({'gene_length': 'sum', 'structure_length': 'sum'})
+
+    # Perform the grouping and aggregation
+    percent_df = df.groupby('gene_id', as_index=False).agg(aggregated_functions)
+
+    # Rename the 'start' column to 'count' after aggregation to reflect the number of occurrences
+    percent_df = percent_df.rename(columns={'start': 'count'})
+
+    return percent_df
+#print(df_maker_percent(IR))
+
+
+def df_maker_density(file_path):
+    # Read the file into a DataFrame
+    df = pd.read_csv(file_path, sep='\t', header=None)
+
+    if df.shape[1] == 11:
+        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
+                      'chr_end', 'structure_start', 'structure_end', 'struct', 'sequ']
+    #df['gene_id'] = df['gene_id'].str.split('_').str[0]  # if exons should be attributed to their parent gene
+
+    # Calculate gene length and structure length
+    df['gene_length'] = df['end'] - df['start'] + 1
+    df['structure_length'] = df['structure_end'] - df['structure_start'] + 1
+    # Aggregation dictionary: count occurrences via a column that is guaranteed to exist ('start')
+    aggregated_functions = {
+        'start': 'size',            # count occurrences
+        'gene_length': 'first',     # keep the first occurrence of gene_length for each gene_id
+        'structure_length': 'sum'   # sum the structure_length for each gene_id
+    }
+    # Perform the grouping and aggregation
+    grouped_df = df.groupby('gene_id', as_index=False).agg(aggregated_functions)
+
+    # Rename the 'start' column to 'count' after aggregation to reflect the number of occurrences
+    grouped_df = grouped_df.rename(columns={'start': 'count'})
+    grouped_df['count_norm'] = grouped_df['count'] / grouped_df['gene_length']
+    grouped_df['length_norm'] = grouped_df['structure_length'] / grouped_df['gene_length']
+
+    # Drop the intermediate columns
+    grouped_df = grouped_df.drop(columns=['gene_length', 'structure_length'])
+    return grouped_df
+
+#print(df_maker_density(IRex))
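To make the two normalised columns concrete, a toy example with one 1 kb gene carrying two structures of 20 bp and 50 bp (all values invented):

```python
import pandas as pd

toy = pd.DataFrame({"gene_id": ["g1", "g1"],
                    "start": [0, 0], "end": [999, 999],
                    "structure_start": [10, 500], "structure_end": [29, 549]})
toy["gene_length"] = toy["end"] - toy["start"] + 1                          # 1000
toy["structure_length"] = toy["structure_end"] - toy["structure_start"] + 1  # 20 and 50
g = toy.groupby("gene_id").agg(count=("start", "size"),
                               gene_length=("gene_length", "first"),
                               structure_length=("structure_length", "sum"))
g["count_norm"] = g["count"] / g["gene_length"]                # 2 / 1000
g["length_norm"] = g["structure_length"] / g["gene_length"]    # 70 / 1000
```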
+
+
+def exon_com(comu_dict):  # comu_dict is currently unused
+    file = "/home/matthaus/Documents/data/annot/exons.bed"
+    df = pd.read_csv(file, sep='\t', header=None)
+    df.columns = ['chr', 'start', 'end', 'exon_id', 'score', 'strand']
+    # Step 1: Keep only the exon_id column
+    df = df[['exon_id']]
+
+    # Step 2: Create the gene_id column based on exon_id
+    df['gene_id'] = df['exon_id'].apply(lambda x: x.split('_')[0])
+    return df
+
+
+def comm_filter(str_df, comu_dict):
+    df_struct = df_maker_density(str_df)
+    print(str_df.split("/")[-1])
+    print("Size of original DataFrame (intersect): ", df_struct.shape)
+    dgene = read_community_genes(comu_dict)
+    dfs = []
+    for k, v in dgene.items():
+        community = k
+        # Initialize a DataFrame holding all genes of the community
+        all_genes_df = pd.DataFrame({'gene_id': v})
+        # Left-merge with df_struct: genes without a structure keep NaN, filled with 0 below
+        kc = pd.merge(all_genes_df, df_struct, on='gene_id', how='left').fillna({'count': 0, 'structure_length': 0})
+        kc['community'] = community  # Assign the community value
+        print(community)
+        dfs.append(kc)  # Append the modified DataFrame to the list
+    result_df = pd.concat(dfs, ignore_index=True)
+    result_df = result_df.fillna(0)
+    print("Size of DataFrame: ", result_df.shape)
+    print("Number of times 'count' is 0: ", (result_df['count'] == 0).sum())
+    return result_df
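The left merge is what keeps zero-count genes in the statistics instead of silently dropping them; a minimal illustration:

```python
import pandas as pd

community_genes = pd.DataFrame({"gene_id": ["g1", "g2", "g3"]})
counts = pd.DataFrame({"gene_id": ["g1"], "count": [4]})

# Genes absent from the intersect come out of the merge as NaN and are set to 0
merged = pd.merge(community_genes, counts, on="gene_id", how="left").fillna({"count": 0})
# g2 and g3 now appear with count 0.
```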
+
+
+def plot_all(str_df, comu_dict, pdf):
+    # NOTE: relies on the module-level globals 'gene', 'community_name' and 'source'
+    # set by the driver loops below.
+    result_df = comm_filter(str_df, comu_dict)
+
+    if gene == "intersect_gene_":
+        threshold = result_df['count'].quantile(0.9)
+        # Drop rows with count values above the 90th percentile
+        result_df = result_df[result_df['count'] <= threshold]
+
+    file_basename = os.path.basename(str_df)
+    structure_name = os.path.splitext(file_basename)[0]
+    sns.set(style="whitegrid")
+    communities = result_df['community'].unique()
+    n_colors = len(communities)
+    palette = sns.color_palette("coolwarm", n_colors=n_colors)
+
+    fig, axs = plt.subplots(2, 3, figsize=(20, 12))
+
+    sns.kdeplot(data=result_df, x="count", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 0])
+    axs[0, 0].set_title(f"Density Curves of Counts for Each Community {community_name} - {source}")
+
+    sns.kdeplot(data=result_df, x="count_norm", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 1])
+    axs[0, 1].set_title(f"Density Curves of Counts Normalised {community_name} - {source}")
+
+    sns.kdeplot(data=result_df, x="length_norm", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 2])
+    axs[0, 2].set_title(f"Density Curves of Length Normalised {community_name} - {source}")
+
+    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3", ax=axs[1, 0])
+    axs[1, 0].set_title(f"Violin Plot of Counts for Each Community {community_name} - {source}")
+    axs[1, 0].tick_params(axis='x', rotation=45)
+
+    sns.violinplot(data=result_df, x="community", y="count_norm", inner="quartile", palette="Set3", ax=axs[1, 1])
+    axs[1, 1].set_title(f"Violin Plot of Counts Normalised {community_name} - {source}")
+    axs[1, 1].tick_params(axis='x', rotation=45)
+
+    sns.violinplot(data=result_df, x="community", y="length_norm", inner="quartile", palette="Set3", ax=axs[1, 2])
+    axs[1, 2].set_title(f"Violin Plot of Length Normalised {community_name} - {source}")
+    axs[1, 2].tick_params(axis='x', rotation=45)
+    plt.tight_layout()
+    pdf.savefig(fig)  # Append the current figure to the PDF file
+    #plt.show()
+    plt.close()
+
+
+communities = {
+    "/home/matthaus/Documents/data/community_SPIN.txt": "spin",
+    "/home/matthaus/Documents/data/community/dendro_gene_nt_ward_euclidean_tr50.0_Sordered.txt": "com23"
+}
+
+inters = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+gene = "intersect_gene_"
+files = {
+    "allquads": ["allquads_G4AAAA.bed", "allquads_G4AABB.bed", "allquads_G4ABAA.bed", "allquads_G4ABAB.bed", "allquads_G4ABBA.bed", "allquads_G4ABBB.bed", "allquads_G4BAAA.bed", "allquads_G4BABA.bed", "allquads_G4BABB.bed", "allquads_G4BBAA.bed"],
+    "G4": ["endoquad_G4.bed", "imgrinder_GQ.bed", "non-bdb_GQ.bed"],
+    "IM": ["imgrinder_IM.bed", "imseeker_imotif.bed"],
+    "palindrome": ["palindrome_cruci.bed", "non-bdb_cruci.bed"],
+    "TFO": ["gquadR_TFO.bed"],
+    "HDNA": ["gquadR_HDNA.bed", "non-bdb_triplex.bed"],
+    "Z": ["gquadR_ZDNA.bed", "non-bdb_Z.bed"],
+    "slipped": ["gquadR_slipped.bed", "non-bdb_slipped.bed"],
+    "Satellite": ["dfam_Satellite.bed", "repbase_Satellite.bed"],
+    "APR": ["gquadR_APR.bed", "non-bdb_APR.bed"],
+    "STR": ["gquadR_STR.bed", "non-bdb_STR.bed"],
+    "DR": ["non-bdb_DR.bed"],
+    "IR": ["palindrome.bed", "non-bdb_IR.bed"],
+    "MR": ["non-bdb_MR.bed"]
+}
+
+
+stats_df = pd.DataFrame()
+for struct, sources in files.items():
+    for source in sources:
+        directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+        file_name = f"{directory}{gene}{source}"
+        df = comm_filter(file_name, comu)
+        # Five-number summary of the per-gene counts
+        min_val = df['count'].min()
+        max_val = df['count'].max()
+        median_val = df['count'].median()
+        first_quartile_val = df['count'].quantile(0.25)
+        third_quartile_val = df['count'].quantile(0.75)
+        total = df['count'].sum()
+        # Percentage of genes with a count of 0
+        total_rows = len(df)
+        rows_with_count_zero = (df['count'] == 0).sum()
+        zero = (rows_with_count_zero / total_rows) * 100
+
+        new_stats = {
+            'Min': [min_val],
+            'First Quartile': [first_quartile_val],
+            'Median': [median_val],
+            'Third Quartile': [third_quartile_val],
+            'Max': [max_val],
+            '0%': [zero],
+            'total': [total]
+        }
+        new_row_df = pd.DataFrame(new_stats, index=[source])
+        stats_df = pd.concat([stats_df, new_row_df])
+
+# Save the summary DataFrame to a TSV file
+stats_df.to_csv('/home/matthaus/Documents/chia-pet_network/results/nonb/V4/stats_summary_gene.csv', index=True, sep="\t")
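The loops below rely on matplotlib's PdfPages: every pdf.savefig(fig) inside the open context appends one page, so each structure collects all of its community plots in a single PDF. The pattern in isolation (file name illustrative):

```python
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages("example_densityplot.pdf") as pdf:
    for title in ["page 1", "page 2"]:
        fig, ax = plt.subplots()
        ax.set_title(title)
        pdf.savefig(fig)  # appends one page per call
        plt.close(fig)
```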
+
+
+for struct, sources in files.items():
+    # Define the PDF path outside the community loop, so one PDF is created per structure
+    pdf_path = f'/home/matthaus/Documents/chia-pet_network/results/nonb/V4/gene/{struct}_densityplot.pdf'
+    with PdfPages(pdf_path) as pdf:
+        for community_path, community_name in communities.items():
+            for source in sources:
+                directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+                file_name = f"{directory}{gene}{source}"
+
+                # plot_all draws one figure and appends it to the open PDF
+                plot_all(file_name, community_path, pdf)
+
+files = {
+    "allquads": ["allquads_G4AAAA.bed", "allquads_G4AABB.bed", "allquads_G4ABAA.bed", "allquads_G4ABAB.bed", "allquads_G4ABBA.bed", "allquads_G4ABBB.bed", "allquads_G4BAAA.bed", "allquads_G4BABA.bed", "allquads_G4BABB.bed", "allquads_G4BBAA.bed"],
+    "G4": ["endoquad_G4.bed", "imgrinder_GQ.bed", "non-bdb_GQ.bed"],
+    "IM": ["imgrinder_IM.bed", "imseeker_imotif.bed"],
+    "palindrome": ["palindrome_cruci.bed", "non-bdb_cruci.bed"],
+    "TFO": ["gquadR_TFO.bed"],
+    "HDNA": ["gquadR_HDNA.bed", "non-bdb_triplex.bed"],
+    "Z": ["gquadR_ZDNA.bed", "non-bdb_Z.bed"],
+    "slipped": ["gquadR_slipped.bed", "non-bdb_slipped.bed"],
+    "Satellite": ["dfam_Satellite.bed"],
+    "APR": ["non-bdb_APR.bed"],
+    "STR": ["gquadR_STR.bed", "non-bdb_STR.bed"],
+    "DR": ["non-bdb_DR.bed"],
+    "IR": ["palindrome.bed", "non-bdb_IR.bed"],
+    "MR": ["non-bdb_MR.bed"]
+}
+gene = "intersect_prom_"
+
+
+stats_df = pd.DataFrame()
+for struct, sources in files.items():
+    for source in sources:
+        directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+        file_name = f"{directory}{gene}{source}"
+        df = comm_filter(file_name, comu)
+        # Five-number summary of the per-gene counts
+        min_val = df['count'].min()
+        max_val = df['count'].max()
+        median_val = df['count'].median()
+        first_quartile_val = df['count'].quantile(0.25)
+        third_quartile_val = df['count'].quantile(0.75)
+        total = df['count'].sum()
+        # Percentage of genes with a count of 0
+        total_rows = len(df)
+        rows_with_count_zero = (df['count'] == 0).sum()
+        zero = (rows_with_count_zero / total_rows) * 100
+
+        new_stats = {
+            'Min': [min_val],
+            'First Quartile': [first_quartile_val],
+            'Median': [median_val],
+            'Third Quartile': [third_quartile_val],
+            'Max': [max_val],
+            '0%': [zero],
+            'total': [total]
+        }
+        new_row_df = pd.DataFrame(new_stats, index=[source])
+        stats_df = pd.concat([stats_df, new_row_df])
+
+# Save the summary DataFrame to a TSV file
+stats_df.to_csv('/home/matthaus/Documents/chia-pet_network/results/nonb/V4/stats_summary_prom.csv', index=True, sep="\t")
+
+
+for struct, sources in files.items():
+    # Define the PDF path outside the community loop, so one PDF is created per structure
+    pdf_path = f'/home/matthaus/Documents/chia-pet_network/results/nonb/V4/prom/{struct}_densityplot.pdf'
+    with PdfPages(pdf_path) as pdf:
+        for community_path, community_name in communities.items():
+            for source in sources:
+                directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+                file_name = f"{directory}{gene}{source}"
+
+                # plot_all draws one figure and appends it to the open PDF
+                plot_all(file_name, community_path, pdf)
+
+
+def plot_all_exon(str_df, comu_dict, pdf):
+    # NOTE: like plot_all, relies on the module-level globals 'community_name' and 'source'.
+    result_df = comm_filter_exon(str_df, comu_dict)
+    file_basename = os.path.basename(str_df)
+    structure_name = os.path.splitext(file_basename)[0]
+    sns.set(style="whitegrid")
+    communities = result_df['community'].unique()
+    n_colors = len(communities)
+    palette = sns.color_palette("coolwarm", n_colors=n_colors)
+
+    fig, axs = plt.subplots(2, 3, figsize=(20, 12))
+
+    sns.kdeplot(data=result_df, x="count", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 0])
+    axs[0, 0].set_title(f"Density Curves of Counts for Each Community {community_name} - {source}")
+
+    sns.kdeplot(data=result_df, x="count_norm", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 1])
+    axs[0, 1].set_title(f"Density Curves of Counts Normalised {community_name} - {source}")
+
+    sns.kdeplot(data=result_df, x="length_norm", hue="community", fill=True, common_norm=False, palette=palette, ax=axs[0, 2])
+    axs[0, 2].set_title(f"Density Curves of Length Normalised {community_name} - {source}")
+
+    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3", ax=axs[1, 0])
+    axs[1, 0].set_title(f"Violin Plot of Counts for Each Community {community_name} - {source}")
+    axs[1, 0].tick_params(axis='x', rotation=45)
+
+    sns.violinplot(data=result_df, x="community", y="count_norm", inner="quartile", palette="Set3", ax=axs[1, 1])
+    axs[1, 1].set_title(f"Violin Plot of Counts Normalised {community_name} - {source}")
+    axs[1, 1].tick_params(axis='x', rotation=45)
+
+    sns.violinplot(data=result_df, x="community", y="length_norm", inner="quartile", palette="Set3", ax=axs[1, 2])
+    axs[1, 2].set_title(f"Violin Plot of Length Normalised {community_name} - {source}")
+    axs[1, 2].tick_params(axis='x', rotation=45)
+    plt.tight_layout()
+    pdf.savefig(fig)  # Append the current figure to the PDF file
+    #plt.show()
+    plt.close()
+
+
+def comm_filter_exon(str_df, comu_dict):
+    # comu_dict is currently unused; the exon table is always built from the module-level 'comu'
+    exondf = exon_com(comu)
+    df_struct = df_maker_density(str_df)
+
+    # Create a column with the actual gene ID by removing the exon suffix
+    df_struct['actual_gene_id'] = df_struct['gene_id'].apply(lambda x: x.split('_')[0])
+    # Outer merge so that exons without any structure are kept
+    merged_df = pd.merge(df_struct, exondf, left_on='gene_id', right_on='exon_id', how='outer')
+
+    # Exons absent from the intersect get 0 for the count and the normalised columns
+    merged_df['count'] = merged_df['count'].fillna(0)
+    merged_df['count_norm'] = merged_df['count_norm'].fillna(0)
+    merged_df['length_norm'] = merged_df['length_norm'].fillna(0)
+    # Drop the redundant gene_id column from df_struct ('exon_id' carries the same information)
+    merged_df.drop('gene_id_x', axis=1, inplace=True)
+    return merged_df
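Both exon helpers rely on the convention that an exon ID is the parent gene ID plus an underscore-separated suffix, so recovering the gene is just a split (IDs illustrative):

```python
exon_id = "ENSG00000187634_3"    # illustrative exon of SAMD11
gene_id = exon_id.split("_")[0]  # -> "ENSG00000187634"
```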
+
+
+def exon_presence_plot(str_df, comu_dict):
+    # NOTE: relies on the module-level global 'source' for the output column name.
+    df_struct = df_maker_density(str_df)        # per-exon counts from the intersect file
+    dgene = read_community_genes(comu_dict)     # gene-community mapping
+    df_struct['actual_gene_id'] = df_struct['gene_id'].apply(lambda x: x.split('_')[0])
+
+    dfs = []
+    for k, v in dgene.items():
+        community = k
+        kc = df_struct[df_struct['actual_gene_id'].isin(v)].copy()  # copy to avoid modifying the original DataFrame
+        kc['community'] = community  # Assign the community value
+        dfs.append(kc)  # Append the modified DataFrame to the list
+    result_df = pd.concat(dfs, ignore_index=True)
+    df_filtered = result_df[['gene_id', 'community']]
+    df_filtered = df_filtered.rename(columns={'community': source})
+
+    return df_filtered
+
+
+# Specified order for the first (SPIN) community categories
+order_first_community = [
+    "Lamina", "Near-Lm2", "Near-Lm1", "Interior-Act3", "Lamina-Like",
+    "Interior-Repr1", "Interior-Act2", "Interior-Repr2", "Interior-Act1", "Speckle"
+]
+
+# Specified order for the second (Com23) community categories
+order_second_community = [
+    "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10",
+    "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", "G20",
+    "G21", "G22", "G23"
+]
+z = 0
+
+
+files = {
+    "G4": ["endoquad_G4.bed", "imgrinder_GQ.bed", "non-bdb_GQ.bed"],
+    "IM": ["imgrinder_IM.bed", "imseeker_imotif.bed"],
+    "palindrome": ["palindrome_cruci.bed", "non-bdb_cruci.bed"],
+    "TFO": ["gquadR_TFO.bed"],
+    "HDNA": ["gquadR_HDNA.bed", "non-bdb_triplex.bed"],
+    "Z": ["gquadR_ZDNA.bed", "non-bdb_Z.bed"],
+    "slipped": ["gquadR_slipped.bed", "non-bdb_slipped.bed"],
+    "Satellite": ["dfam_Satellite.bed"],
+    "APR": ["non_bdb_APR.bed"],
+    "STR": ["gquadR_STR.bed", "non_bdb_STR.bed"],
+    "DR": ["non_bdb_DR.bed"],
+    "IR": ["palindrome.bed", "non-bdb_IR.bed"],
+    "MR": ["non_bdb_MR.bed"]
+}
+
+gene = "intersect_exon_"
+for community_path, community_name in communities.items():
+    ldf = []
+    for struct, sources in files.items():
+        for source in sources:
+            directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+            file_name = f"{directory}{gene}{source}"
+
+            ck = exon_presence_plot(file_name, community_path)
+            ldf.append(ck)  # Append the modified DataFrame to the list
+
+    result_df = pd.concat(ldf, ignore_index=True)
+    df = result_df
+    print(df)
+
+    # Build the summary DataFrame: occurrences of each community per bed file column
+    summary_df = pd.DataFrame()
+
+    # Iterate through each column except 'gene_id'
+    for col in df.columns[1:]:
+        # Count the occurrence of each unique value in the current column
+        counts = df[col].value_counts().to_frame(name=col)
+        # If the summary DataFrame is empty, initialize it with the counts
+        if summary_df.empty:
+            summary_df = counts
+        else:
+            # Otherwise join the new counts
+            summary_df = summary_df.join(counts, how='outer')
+
+    # Replace NaN with 0 to indicate no occurrences
+    summary_df = summary_df.fillna(0)
+
+    # Proportions of each group value within each bed file category
+    proportions_df = summary_df.div(summary_df.sum(axis=1), axis=0)
+
+    is_first_community = proportions_df.index[0] in order_first_community
+
+    # Reorder 'proportions_df' according to the specified order for the community
+    if is_first_community:
+        ordered_index = order_first_community
+    else:
+        ordered_index = order_second_community
+
+    # Reindex 'proportions_df' to match the desired order
+    proportions_df = proportions_df.reindex(ordered_index)
+    # Sum of the proportions for each group across all categories
+    sum_proportions = proportions_df.sum(axis=0)
+
+    # Sort the groups by their summed proportions (descending, so the largest
+    # stack segments are added first and end up at the bottom of the bars)
+    sorted_columns = sum_proportions.sort_values(ascending=False).index
+
+    # Reorder 'proportions_df' columns accordingly
+    proportions_df_sorted = proportions_df[sorted_columns]
+
+    # Initialize a Plotly figure
+    fig = go.Figure()
+
+    # Add a bar for each group value (column) in the sorted 'proportions_df'
+    for group_value in proportions_df_sorted.columns:
+        fig.add_trace(go.Bar(
+            x=proportions_df_sorted.index,          # bed file categories on the x-axis
+            y=proportions_df_sorted[group_value],   # proportions of occurrences for this group value
+            name=group_value,                       # legend name
+        ))
+
+    # Layout for a stacked bar plot
+    fig.update_layout(
+        barmode='stack',
+        title='Diversity of Structures Across Groups (Proportions, Ordered)',
+        xaxis=dict(
+            title='Bed File Category',
+            tickangle=-45,  # rotate labels for better readability
+        ),
+        yaxis=dict(
+            title='Proportion of Occurrences',
+            tickformat=".0%",
+        ),
+        legend_title='Group Value',
+    )
+    # First iteration handles the SPIN communities, second the Com23 communities
+    if z == 0:
+        z += 1
+        fig.show()
+        fig.write_html("/home/matthaus/Documents/chia-pet_network/results/nonb/exon/Spin_EXON.html")
+    else:
+        fig.show()
+        fig.write_html("/home/matthaus/Documents/chia-pet_network/results/nonb/exon/Com23_EXON.html")
+
+
+for struct, sources in files.items():
+    # One PDF per structure (exon density plots; the plotting call is currently disabled)
+    pdf_path = f'/home/matthaus/Documents/chia-pet_network/results/nonb/V4/exon/{struct}_densityplot.pdf'
+    with PdfPages(pdf_path) as pdf:
+        for community_path, community_name in communities.items():
+            for source in sources:
+                directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+                file_name = f"{directory}{gene}{source}"
+                #plot_all_exon(file_name, community_path, pdf)
+
+files = {
+    "G4": ["endoquad_G4.bed", "imgrinder_GQ.bed", "non-bdb_GQ.bed"],
+    "IM": ["imgrinder_IM.bed", "imseeker_imotif.bed"],
+    "palindrome": ["palindrome_cruci.bed", "non-bdb_cruci.bed"],
+    "TFO": ["gquadR_TFO.bed"],
+    "HDNA": ["gquadR_HDNA.bed", "non-bdb_triplex.bed"],
+    "Z": ["gquadR_ZDNA.bed", "non-bdb_Z.bed"],
+    "slipped": ["gquadR_slipped.bed", "non-bdb_slipped.bed"],
+    "Satellite": ["dfam_Satellite.bed"],
+    "APR": ["non_bdb_APR.bed"],
+    "STR": ["gquadR_STR.bed", "non_bdb_STR.bed"],
+    "DR": ["non_bdb_DR.bed"],
+    "IR": ["palindrome.bed", "non-bdb_IR.bed"],
+    "MR": ["non_bdb_MR.bed"]
+}
+stats_df = pd.DataFrame()
+for struct, sources in files.items():
+    for source in sources:
+        directory = "/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/"
+        file_name = f"{directory}{gene}{source}"
+        df = comm_filter_exon(file_name, comu)
+        # Five-number summary of the per-exon counts
+        min_val = df['count'].min()
+        max_val = df['count'].max()
+        median_val = df['count'].median()
+        first_quartile_val = df['count'].quantile(0.25)
+        third_quartile_val = df['count'].quantile(0.75)
+        total = df['count'].sum()
+        # Percentage of exons with a count of 0
+        total_rows = len(df)
+        rows_with_count_zero = (df['count'] == 0).sum()
+        zero = (rows_with_count_zero / total_rows) * 100
+
+        new_stats = {
+            'Min': [min_val],
+            'First Quartile': [first_quartile_val],
+            'Median': [median_val],
+            'Third Quartile': [third_quartile_val],
+            'Max': [max_val],
+            '0%': [zero],
+            'total': [total]
+        }
+        new_row_df = pd.DataFrame(new_stats, index=[source])
+        stats_df = pd.concat([stats_df, new_row_df])
+
+# Save the summary DataFrame to a TSV file
+stats_df.to_csv('/home/matthaus/Documents/chia-pet_network/results/nonb/V4/stats_summary_exon.csv', index=True, sep="\t")
diff --git a/src/non-b/genelist-parser_bed-creator.py b/src/non-b/genelist-parser_bed-creator.py
deleted file mode 100644
index ceb3cccba5c819bb658a16dfdf1d5d5ffb652d4b..0000000000000000000000000000000000000000
--- a/src/non-b/genelist-parser_bed-creator.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import argparse
-import os
-from collections import defaultdict
-
-def read_community_genes(filename):
-    dgene = defaultdict(list)
-
-    with open(filename, 'r') as file:
-        nbli = 0
-        for line in file:
-            if nbli >= 1:
-                linesplit = line.split("\t")
-                gene_ids = linesplit[2].split(",")
-                for gene_id in gene_ids:
-                    dgene[linesplit[0]].append(gene_id.strip())
-            nbli += 1
-
-    return dgene
-
-def filter_bed_file(bed_filename, gene_ids):
-    filtered_genes = []
-
-    with open(bed_filename, 'r') as bed_file:
-        for line in bed_file:
-            gene_id = line.split()[3]
-            if gene_id in gene_ids:
-                filtered_genes.append(line.strip())
-
-    return filtered_genes
-
-def main():
-    parser = argparse.ArgumentParser(description='Filter .bed files using gene IDs extracted from input files.')
-    parser.add_argument('community_file', type=str, help='Path to the community gene file')
-    parser.add_argument('bed_file', type=str, help='.bed file to filter')
-
-    args = parser.parse_args()
-
-    # Create a directory based on the community file name
-    output_dir = os.path.splitext(args.community_file)[0] + "_output"
-    os.makedirs(output_dir, exist_ok=True)
-
-    community_genes = read_community_genes(args.community_file)
-
-    for community, gene_ids in community_genes.items():
-        filtered_genes = filter_bed_file(args.bed_file, gene_ids)
-
-        # Create a unique filename based on the community name
-        output_filename = os.path.join(output_dir, f"{community}_filtered.bed")
-        with open(output_filename, 'w') as output_file:
-            for gene in filtered_genes:
-                output_file.write(gene + '\n')
-
-        print(f"Filtered genes from {args.bed_file} using gene IDs from {community} community saved to {output_filename}")
-
-if __name__ == "__main__":
-    main()
diff --git a/src/non-b/heatmap_TF.py b/src/non-b/heatmap_TF.py
new file mode 100644
index 0000000000000000000000000000000000000000..8603869d0ce61bfa735bd895e7de43de4749e4ad
--- /dev/null
+++ b/src/non-b/heatmap_TF.py
@@ -0,0 +1,210 @@
+from collections import defaultdict
+
+import pandas as pd
+import sys
+import numpy as np
+import re
+import random
+import glob
+
+
+def read_community_genes(filename):
+    dgene = defaultdict(list)
+
+    with open(filename, 'r') as file:
+        nbli = 0
+        for line in file:
+            if nbli >= 1:  # skip the header line
+                linesplit = line.split("\t")
+                if int(linesplit[1]) >= 10:  # keep only communities with at least 10 genes
+                    gene_ids = linesplit[8].split(",")
+                    for gene_id in gene_ids:
+                        dgene[linesplit[0]].append(gene_id.strip())
+            nbli += 1
+    return dgene
+
+
+def df_maker(file_path):
+    # Read the file into a DataFrame
+    df = pd.read_csv(file_path, sep='\t', header=None)
+
+    # Set the column names according to the number of columns
+    if df.shape[1] == 11:
+        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
+                      'chr_end', 'structure_start', 'structure_end', 'struct', 'sequ']
+    elif df.shape[1] == 12:
+        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
+                      'chr_end', 'structure_start', 'structure_end', 'struct', 'score', 'sequ']
+    elif df.shape[1] == 13:
+        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand', 'chr_end', 'structure_start',
+                      'structure_end', 'struct', 'score', 'structure_strand', 'sequ']
+
+    # Calculate gene length and structure length
+    df['gene_length'] = df['end'] - df['start'] + 1
+    df['structure_length'] = df['structure_end'] - df['structure_start'] + 1
+
+    # Convert 'sequ' to lowercase to count nucleotides in a case-insensitive manner
+    df['sequ_lower'] = df['sequ'].str.lower()
+
+    # Count nucleotides
+    df['A_count'] = df['sequ_lower'].str.count('a')
+    df['C_count'] = df['sequ_lower'].str.count('c')
+    df['G_count'] = df['sequ_lower'].str.count('g')
+    df['T_count'] = df['sequ_lower'].str.count('t')
+
+    # Calculate the total number of nucleotides for each sequence
+    df['total_nucleotides'] = df['A_count'] + df['C_count'] + df['G_count'] + df['T_count']
+
+    # Calculate the percentage of each nucleotide
+    df['A_percent'] = (df['A_count'] / df['total_nucleotides']) * 100
+    df['C_percent'] = (df['C_count'] / df['total_nucleotides']) * 100
+    df['G_percent'] = (df['G_count'] / df['total_nucleotides']) * 100
+    df['T_percent'] = (df['T_count'] / df['total_nucleotides']) * 100
+
+    # Group per gene and merge the per-gene summaries
+    grouped_df = df.groupby('gene_id').size().reset_index(name='count')
+    grouped_df1 = df.groupby('gene_id')['structure_length'].sum().reset_index()
+    grouped_gene_length = df.groupby('gene_id')['gene_length'].sum().reset_index()
+    grouped_nucleotide_percents = df.groupby('gene_id')[['A_percent', 'C_percent', 'G_percent', 'T_percent']].mean().reset_index()
+
+    # Merge the grouped DataFrames
+    concatenated_df = pd.merge(grouped_df, grouped_df1, on='gene_id')
+    concatenated_df = pd.merge(concatenated_df, grouped_gene_length, on='gene_id')
+    concatenated_df = pd.merge(concatenated_df, grouped_nucleotide_percents, on='gene_id')
+    # Ratio of structure length to gene length, normalised to a 0-10 scale
+    concatenated_df['ratio%_struct'] = (concatenated_df['structure_length'] / concatenated_df['gene_length']) * 100
+    min_ratio = concatenated_df['ratio%_struct'].min()
+    max_ratio = concatenated_df['ratio%_struct'].max()
+    concatenated_df['norm'] = 10 * ((concatenated_df['ratio%_struct'] - min_ratio) / (max_ratio - min_ratio))
+
+    # Final adjustments
+    final_df = concatenated_df.drop(columns=['gene_length', 'structure_length'])
+    return final_df
+
+
+def comm_filter(str_df, comu_dict):
+    df_struct = df_maker(str_df)
+    dgene = read_community_genes(comu_dict)
+
+    dfs = []
+    for k, v in dgene.items():
+        community = k
+        kc = df_struct[df_struct['gene_id'].isin(v)].copy()  # copy to avoid modifying the original DataFrame
+        kc['community'] = community  # Assign the community value
+        dfs.append(kc)  # Append the modified DataFrame to the list
+
+    result_df = pd.concat(dfs, ignore_index=True)
+
+    # Threshold at the count value of the 90th percentile
+    threshold = result_df['count'].quantile(0.9)
+
+    # Drop rows with count values above the threshold
+    result_df = result_df[result_df['count'] <= threshold]
+
+    return result_df
+
+
+def nucleotide_sum_by_community(str_file, commu_file):
+    result_df = comm_filter(str_file, commu_file)
+    # Group by 'community': mean of the nucleotide percentages, sum of the counts
+    nucleotide_sums = result_df.groupby('community')[['A_percent', 'C_percent', 'G_percent', 'T_percent']].mean().reset_index()
+    counts_sum = result_df.groupby('community')['count'].sum().reset_index(name='count')
+    nucleotide_sums = pd.merge(nucleotide_sums, counts_sum, on='community')
+    num_unique_communities = len(nucleotide_sums['community'])
+
+    if num_unique_communities == 10:
+        # SPIN community order
+        community_order = ['Lamina', 'Near-Lm2', 'Near-Lm1', 'Interior-Act3', 'Lamina-Like', 'Interior-Repr1', 'Interior-Act2',
+                           'Interior-Repr2', 'Interior-Act1', 'Speckle']
+        # Set the 'community' column to a categorical type with the specified order
+        nucleotide_sums['community'] = pd.Categorical(nucleotide_sums['community'], categories=community_order, ordered=True)
+
+    elif num_unique_communities == 23:
+        # Com23 community order
+        community_order = ['G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'G10', 'G11', 'G12', 'G13', 'G14', 'G15', 'G16', 'G17', 'G18', 'G19', 'G20', 'G21', 'G22', 'G23']
+        # Set the 'community' column to a categorical type with the specified order
+        nucleotide_sums['community'] = pd.Categorical(nucleotide_sums['community'], categories=community_order, ordered=True)
+
+    # Sort the DataFrame by the 'community' column according to the defined order
+    nucleotide_sums = nucleotide_sums.sort_values('community')
+
+    # Append the gene count to each community name
+    nucleotide_sums['community'] = nucleotide_sums['community'].astype(str) + "_" + nucleotide_sums['count'].astype(str)
+
+    # Remove the 'count' column as it is no longer needed
+    nucleotide_sums.drop('count', axis=1, inplace=True)
+
+    print(nucleotide_sums)
+    return nucleotide_sums
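The pd.Categorical trick above is what forces the SPIN order onto the rows rather than an alphabetical sort; in isolation (order truncated for brevity, values invented):

```python
import pandas as pd

order = ["Lamina", "Near-Lm2", "Near-Lm1"]  # truncated for the example
df = pd.DataFrame({"community": ["Near-Lm1", "Lamina", "Near-Lm2"],
                   "A_percent": [28.1, 30.5, 29.2]})
df["community"] = pd.Categorical(df["community"], categories=order, ordered=True)
df = df.sort_values("community")  # rows now follow the declared order
```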
+
+
+def generate_gene_communities(annotation_bedfile, outputfile_path):
+    # Read the BED file and extract the 4th column containing gene IDs
+    df = pd.read_csv(annotation_bedfile, sep='\t', header=None, usecols=[3])
+    unique_genes = list(df[3].unique())
+
+    communities_data = []  # To collect community data
+
+    for community_count in range(1, 16):  # Limit to 15 communities
+        if len(unique_genes) < 600:
+            break  # Stop if fewer than 600 genes are left
+        community_size = random.randint(600, min(1200, len(unique_genes)))
+        selected_genes = random.sample(unique_genes, community_size)
+
+        # Remove the selected genes from the unique_genes list
+        unique_genes = [gene for gene in unique_genes if gene not in selected_genes]
+
+        communities_data.append({
+            'community': f'RdM{community_count}',
+            'nodes': len(selected_genes),
+            'genes': ','.join(selected_genes)
+        })
+
+    communities_df = pd.DataFrame(communities_data)
+    communities_df.to_csv(outputfile_path, sep='\t', index=False)
+
+
+#generate_gene_communities('/home/matthaus/Documents/data/annot/genes.bed', '/home/matthaus/Documents/data/community/Rdcommu.txt')
+
+
+community = "/home/matthaus/Documents/data/community_SPIN.txt"
+community23 = "/home/matthaus/Documents/data/community/dendro_gene_nt_ward_euclidean_tr50.0_Sordered.txt"
+
+lamina = "/home/matthaus/Téléchargements/community_counts_Lamina_strong_clean_0.0_w2_weigthed.txt"
+speck = "/home/matthaus/Téléchargements/community_counts_Speckle_clean_0.0_w2_weigthed.txt"
+rdblu = "Picnic"
+from .heatmap_gene_content import heatmap_fig  # requires heatmap_gene_content to be importable alongside this script
+
+rand = '/home/matthaus/Documents/data/community/Rdcommu.txt'
+
+# NOTE: 'links' was left undefined in this script; assuming it should list the
+# intersect BED files to process (placeholder pattern, adjust to the actual selection).
+links = sorted(glob.glob("/home/matthaus/Documents/data/basedo/BED_files/intersect/reshape/intersect_*.bed"))
+
+for link in links:
+    # Use the link as the first argument in nucleotide_sum_by_community
+    df = nucleotide_sum_by_community(link, community)
+    dfrand = nucleotide_sum_by_community(link, rand)
+    df23 = nucleotide_sum_by_community(link, community23)
+    df = df.set_index('community')
+    dfrand = dfrand.set_index('community')
+    df23 = df23.set_index('community')
+
+    # Extract the structure name from the link for the output filename
+    match = re.search(r"intersect_(.*?)\.bed", link)
+    name_part = match.group(1)  # the string between "intersect_" and ".bed"
+
+    # Heatmap for the SPIN communities
+    output_path_df = f"./results/nonb/heatmap_{name_part}"
+    heatmap_fig(df, output_path_df, rdblu, False, True, 2, None)
+
+    # Heatmap for the random control communities
+    output_path_dfrand = f"./results/nonb/heatmapRand_{name_part}"
+    heatmap_fig(dfrand, output_path_dfrand, rdblu, False, True, 2, None)
+
+    # Heatmap for the 23 communities
+    output_path_df23 = f"./results/nonb/heatmap23_{name_part}"
+    heatmap_fig(df23, output_path_df23, rdblu, False, True, 2, None)
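For reference, the table handed to heatmap_fig has one row per community (its name suffixed with the summed count) and one column per nucleotide percentage; a quick check, assuming `links` is populated as above:

```python
df = nucleotide_sum_by_community(links[0], community).set_index('community')
print(df.columns.tolist())  # ['A_percent', 'C_percent', 'G_percent', 'T_percent']
```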
diff --git a/src/non-b/str_intersect_dblbed.sh b/src/non-b/intersect.sh
similarity index 55%
rename from src/non-b/str_intersect_dblbed.sh
rename to src/non-b/intersect.sh
index 23afb83e74055c9289f2d16517de05d3533885bc..5fb279b510bc14e28ece0c155683035e4bbc41a3 100644
--- a/src/non-b/str_intersect_dblbed.sh
+++ b/src/non-b/intersect.sh
@@ -1,15 +1,16 @@
 #!/bin/bash
 
-# Check if two arguments are provided
-if [ "$#" -ne 2 ]; then
-  echo "Usage: $0 annotation.bed structure.bed
-need to be in a bedtools environement"
+# Check that three arguments are provided
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 annotation.bed structure.bed outpath
+(must be run in a bedtools environment)"
   exit 1
 fi
-
+# structures=("DZDNA_HG38.bed" "gquadR_ZDNA.bed" "gquadR_slipped.bed" "gquadR_TFO.bed" "IMSEEKER.bed" "PiMS2a.bed" "triplex.bed" "palindromes_cruci.bed" "Repeatmasker_Satellite.bed" "dfam_Satellite.bed" "gquadR_APR.bed" "palindromes.bed" "gquadR_STR.bed")
+#
 # Assign input file names
 annotation_bed="$1"
 structure_bed="$2"
-
+out="$3"
 # Extract name of the structure file
 structure_name=$(basename "$structure_bed" | cut -d. -f1)
@@ -17,7 +18,7 @@ structure_name=$(basename "$structure_bed" | cut -d. -f1)
 
 output_dir=$(dirname "$structure_bed")
 
 # Perform bedtools intersect
-output_file="${output_dir}/intersect_${structure_name}.bed"
+output_file="${out}intersect_${structure_name}.bed"
 bedtools intersect -wa -wb -F 1 -a "$annotation_bed" -b "$structure_bed" > "$output_file"
 
 echo "Intersect output saved to: $output_file"
diff --git a/src/non-b/plot_generator.py b/src/non-b/plot_generator.py
deleted file mode 100644
index d1d700df1e3f0d6359d12635f28d560cf11e813e..0000000000000000000000000000000000000000
--- a/src/non-b/plot_generator.py
+++ /dev/null
@@ -1,354 +0,0 @@
-
-from str_stats import comm_filter
-from str_stats import comm_filter_norm
-import os
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-from matplotlib.patches import Patch
-
-
-def plot_all(str_df, comu_dict):
-    from str_stats import comm_filter_norm
-
-    # Filter the data once
-    result_df_norm = comm_filter_norm(str_df, comu_dict)
-    result_df = comm_filter(str_df, comu_dict)
-    # Extract the base name of the file
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-
-    # Set the seaborn style
-    sns.set(style="whitegrid")
-
-    # Get the unique communities
-    communities = result_df['community'].unique()
-
-    # Create a gradient colour palette from blue to red
-    n_colors = len(communities)
-    palette = sns.color_palette("coolwarm", n_colors=n_colors)
-
-    # Plot density curves for each community
-    plt.figure(figsize=(10, 6))  # Adjust figure size as needed
-    ax = sns.kdeplot(data=result_df, x="count", hue="community", fill=True, common_norm=False, palette=palette, bw_adjust=.25)
-
-    # Add title and labels
-    plt.title(f"Density Curves of Count for Each Community - {structure_name}")
-    plt.xlabel("Count")
-    plt.ylabel("Density")
-
-    # Custom legend with community names and gradient colours
-    legend_elements = [Patch(facecolor=color, label=community) for community, color in zip(communities, palette)]
-    plt.legend(handles=legend_elements, title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.tight_layout()
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_density_curves_23.png", dpi=300)
-    plt.close()
-
-    # Plot violin plots for each community
-    plt.figure(figsize=(10, 6))
-    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3")
-
-    # Add title and labels
-    plt.title(f"Violin Plot of Count for Each Community - {structure_name}")
-    plt.xlabel("Community")
-    plt.ylabel("Count")
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_violin_23.png", dpi=300)
-    plt.close()
-
-    # Plot normalised violin plots for each community
-    plt.figure(figsize=(10, 6))
-    sns.violinplot(data=result_df_norm, x="community", y="norm", inner="quartile", palette="Set3")
-
-    # Add title and labels
-    plt.title(f"Violin Plot of Normalised Count for Each Community - {structure_name}")
-    plt.xlabel("Community")
-    plt.ylabel("struct_length/gene_length_0-10_scale")
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_violin_norm_23.png", dpi=300)
-    plt.close()
-    # Plot normalised density curves for each community
-    plt.figure(figsize=(10, 6))
-    ax = sns.kdeplot(data=result_df_norm, x="norm", hue="community", fill=True, common_norm=False, palette=palette, bw_adjust=.25)
-
-    # Add title and labels
-    plt.title(f"Density Curves of Normalised Count for Each Community - {structure_name}")
-    plt.xlabel("struct_length/gene_length_0-10_scale")
-    plt.ylabel("Density")
-
-    # Custom legend with community names and gradient colours
-    legend_elements = [Patch(facecolor=color, label=community) for community, color in zip(communities, palette)]
-    plt.legend(handles=legend_elements, title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.tight_layout()
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_density_curves_norm_23.png", dpi=300)
-    plt.close()
-
-def plot_density_curves(str_df, comu_dict):
-    from str_stats import comm_filter
-    result_df = comm_filter(str_df, comu_dict)
-    # Extract the base name of the file
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-    # Set the seaborn style
-    sns.set(style="whitegrid")
-
-    # Get the unique communities
-    communities = result_df['community'].unique()
-
-    # Create a gradient colour palette from blue to red
-    n_colors = len(communities)
-    palette = sns.color_palette("coolwarm", n_colors=n_colors)
-
-    # Plot density curves for each community
-    plt.figure(figsize=(10, 6))
-    ax = sns.kdeplot(data=result_df, x="count", hue="community", fill=True, common_norm=False, palette=palette, bw_adjust=.25)
-
-    # Add title and labels
-    plt.title(f"Density Curves of Count for Each Community - {structure_name}")
-    plt.xlabel("Count")
-    plt.ylabel("Density")
-
-    # Custom legend with community names and gradient colours
-    legend_elements = [Patch(facecolor=color, label=community) for community, color in zip(communities, palette)]
-    plt.legend(handles=legend_elements, title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.tight_layout()
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_density_curvesG.png", dpi=300)
-
-def plot_violin(str_df, comu_dict):
-    from str_stats import comm_filter
-    result_df = comm_filter(str_df, comu_dict)
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-    sns.set(style="whitegrid")
-
-    # Plot violin plots for each community
-    plt.figure(figsize=(10, 6))
-    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3")
-
-    # Add title and labels
-    plt.title(f"Violin Plot of Count for Each Community - {structure_name}")
-    plt.xlabel("Community")
-    plt.ylabel("Count")
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_violinG.png", dpi=300)
-
-def plot_violin_norm(str_df, comu_dict):
-    from str_stats import comm_filter_norm
-    result_df = comm_filter_norm(str_df, comu_dict)
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-    sns.set(style="whitegrid")
-
-    # Plot violin plots for each community
-    plt.figure(figsize=(10, 6))
-    sns.violinplot(data=result_df, x="community", y="norm", inner="quartile", palette="Set3")
-
-    # Add title and labels
-    plt.title(f"Violin Plot of Normalised Count for Each Community - {structure_name}")
-    plt.xlabel("Community")
-    plt.ylabel("struct_length/gene_length_0-10_scale")
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_violin_normG.png", dpi=300)
-
-def plot_density_curves_norm(str_df, comu_dict):
-    from str_stats import comm_filter_norm
-    result_df = comm_filter_norm(str_df, comu_dict)
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-    # Set the seaborn style
-    sns.set(style="whitegrid")
-
-    # Get the unique communities
-    communities = result_df['community'].unique()
-
-    # Create a gradient colour palette from blue to red
-    n_colors = len(communities)
-    palette = sns.color_palette("coolwarm", n_colors=n_colors)
-
-    # Plot density curves for each community
-    plt.figure(figsize=(10, 6))
-    ax = sns.kdeplot(data=result_df, x="norm", hue="community", fill=True, common_norm=False, palette=palette, bw_adjust=.25)
-
-    # Add title and labels
-    plt.title(f"Density Curves of Normalised Count for Each Community - {structure_name}")
-    plt.xlabel("struct_length/gene_length_0-10_scale")
-    plt.ylabel("Density")
-
-    # Custom legend with community names and gradient colours
-    legend_elements = [Patch(facecolor=color, label=community) for community, color in zip(communities, palette)]
-    # Position the legend outside the plot area
-    plt.legend(handles=legend_elements, title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.tight_layout()
-    #plt.show()
-    plt.savefig(f"/home/matthaus/Images/plots/{structure_name}_density_curves_normG.png", dpi=300)
-
-def plot_all_show(str_df, comu_dict):
-    from str_stats import comm_filter_norm
-
-    # Filter the data once
-    result_df_norm = comm_filter_norm(str_df, comu_dict)
-    result_df = comm_filter(str_df, comu_dict)
-    # Extract the base name of the file
-    file_basename = os.path.basename(str_df)
-    structure_name = os.path.splitext(file_basename)[0]  # Remove the extension
-
-    # Set the seaborn style
-    sns.set(style="whitegrid")
-
-    # Get the unique communities
-    communities = result_df['community'].unique()
-
-    # Create a gradient colour palette from blue to red
-    n_colors = len(communities)
-    palette = sns.color_palette("coolwarm", n_colors=n_colors)
-
-    # Plot density curves for each community
-    plt.figure(figsize=(10, 6))
-    ax = sns.kdeplot(data=result_df, x="count", hue="community", fill=True, common_norm=False, palette=palette, bw_adjust=.25)
-
-    # Add title and labels
-    plt.title(f"Density Curves of Count for Each Community - {structure_name}")
-    plt.xlabel("Count")
-    plt.ylabel("Density")
-
-    # Custom legend with community names and gradient colours
-    legend_elements = [Patch(facecolor=color, label=community) for community, color in zip(communities, palette)]
-    plt.legend(handles=legend_elements, title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.tight_layout()
-    plt.show()
-    plt.close()
-
-    # Plot violin plots for each community
-    plt.figure(figsize=(10, 6))
-    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3")
-
-    # Add title and labels
-    plt.title(f"Violin Plot of Count for Each Community - {structure_name}")
-    plt.xlabel("Community")
-    plt.ylabel("Count")
-
-    # Rotate x-axis labels for better readability
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-
-    plt.show()
-    plt.close()
def plot_all_show(str_df, comu_dict):
    """Show all four plots interactively (no files written), for quick inspection."""
    from str_stats import comm_filter, comm_filter_norm

    # Filter the data once for both the raw-count and the normalized plots
    result_df = comm_filter(str_df, comu_dict)
    result_df_norm = comm_filter_norm(str_df, comu_dict)

    file_basename = os.path.basename(str_df)
    structure_name = os.path.splitext(file_basename)[0]

    sns.set(style="whitegrid")

    communities = result_df['community'].unique()
    palette = sns.color_palette("coolwarm", n_colors=len(communities))

    # Density curves of the raw count, one curve per community
    plt.figure(figsize=(10, 6))
    sns.kdeplot(data=result_df, x="count", hue="community", fill=True,
                common_norm=False, palette=palette, bw_adjust=.25)
    plt.title(f"Density Curves of Count for Each Community - {structure_name}")
    plt.xlabel("Count")
    plt.ylabel("Density")
    legend_elements = [Patch(facecolor=color, label=community)
                       for community, color in zip(communities, palette)]
    plt.legend(handles=legend_elements, title="Community",
               bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    plt.close()

    # Violin plot of the raw count
    plt.figure(figsize=(10, 6))
    sns.violinplot(data=result_df, x="community", y="count", inner="quartile", palette="Set3")
    plt.title(f"Violin Plot of Count for Each Community - {structure_name}")
    plt.xlabel("Community")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    plt.close()

    # Violin plot of the normalized coverage
    plt.figure(figsize=(10, 6))
    sns.violinplot(data=result_df_norm, x="community", y="norm", inner="quartile", palette="Set3")
    plt.title(f"Violin Plot of Normalized Coverage for Each Community - {structure_name}")
    plt.xlabel("Community")
    plt.ylabel("struct_length/gene_length_0-10_scale")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    plt.close()

    # Density curves of the normalized coverage
    plt.figure(figsize=(10, 6))
    sns.kdeplot(data=result_df_norm, x="norm", hue="community", fill=True,
                common_norm=False, palette=palette, bw_adjust=.25)
    plt.title(f"Density Curves of Normalized Coverage for Each Community - {structure_name}")
    plt.xlabel("struct_length/gene_length_0-10_scale")
    plt.ylabel("Density")
    plt.legend(handles=legend_elements, title="Community",
               bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    plt.close()


'''
# To improve: wrap all of this in a single function so the DataFrame is not
# reloaded for every plot.
alu = "../../../data/basedo/repeat/Human.RepeatMasker.map/test11.bed"
community = "/home/matthaus/Documents/data/community_SPIN.txt"
plot_all_show(alu, community)


community = '/home/matthaus/Documents/data/community_SPIN.txt'
community = '/home/matthaus/Documents/data/community/community_gene_G.txt'

triplex = '/home/matthaus/Documents/data/intersect_tests/triplex.bed'
gquad = '/home/matthaus/Documents/data/intersect_tests/G4.bed'
egquad = '/home/matthaus/Documents/data/intersect_tests/EG4.bed'
zdna = '/home/matthaus/Documents/data/intersect_tests/ZDNA.bed'

nb_apr = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/APR.bed'
nb_dr = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/DR.bed'
nb_gq = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/GQ.bed'
nb_ir = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/IR.bed'
nb_mr = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/MR.bed'
nb_str = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/STR.bed'
nb_z = '/home/matthaus/Documents/data/NON-B_DB_hg19/Structures/Non_B_DB/hg38/4col/intersect/Z.bed'

lstr = [gquad, egquad, zdna, nb_apr, nb_gq, nb_z]

# Generate the four saved plots for each structure file
for i in lstr:
    plot_density_curves(i, community)
    plot_density_curves_norm(i, community)
    plot_violin(i, community)
    plot_violin_norm(i, community)
'''
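The plotting code above reads its per-gene table from `str_stats.py`, whose diff follows. Its `norm` column is a min-max rescaling of the percentage of each gene covered by a structure onto a 0-10 scale; a minimal sketch of that transformation on made-up ratios (illustrative values, assuming only pandas):

```python
import pandas as pd

# Illustrative coverage ratios (% of gene length covered by the structure)
ratio = pd.Series([0.5, 2.0, 8.0])

# Min-max rescaling to the 0-10 range, as df_maker does for 'ratio%_struct'
norm = 10 * (ratio - ratio.min()) / (ratio.max() - ratio.min())
print(norm.tolist())  # [0.0, 2.0, 10.0]
```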
diff --git a/src/non-b/str_stats.py b/src/non-b/str_stats.py
deleted file mode 100644
index 9a915d4dcbe996183a2bfb51b1a24226059be106..0000000000000000000000000000000000000000
--- a/src/non-b/str_stats.py
+++ /dev/null
@@ -1,101 +0,0 @@
import pandas as pd
from collections import defaultdict


def df_maker(file_path):
    """Build a per-gene table (count, coverage ratio, 0-10 norm) from a
    bedtools-intersect output file."""
    # Read the intersect output into a DataFrame
    df = pd.read_csv(file_path, sep='\t', header=None)

    # Set column names; 12 columns when the structure .bed carried score/strand
    if df.shape[1] == 10:
        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
                      'chr_end', 'structure_start', 'structure_end', 'struct']
    if df.shape[1] == 12:
        df.columns = ['chr_start', 'start', 'end', 'gene_id', 'gene_name', 'strand',
                      'chr_end', 'structure_start', 'structure_end', 'struct',
                      'score', 'structure_strand']

    # BED intervals are half-open (0-based start, exclusive end), so the
    # length is simply end - start
    df['gene_length'] = df['end'] - df['start']
    df['structure_length'] = df['structure_end'] - df['structure_start']

    # Per gene: number of overlapping structures, total structure length, and
    # the gene length (constant per gene, so take it once rather than summing
    # it over every overlap row)
    grouped_count = df.groupby('gene_id').size().reset_index(name='count')
    grouped_struct = df.groupby('gene_id')['structure_length'].sum().reset_index()
    grouped_gene_length = df.groupby('gene_id')['gene_length'].first().reset_index()

    # Merge the grouped DataFrames
    concatenated_df = pd.merge(grouped_count, grouped_struct, on='gene_id')
    concatenated_df = pd.merge(concatenated_df, grouped_gene_length, on='gene_id')

    # Percentage of the gene covered by structures
    concatenated_df['ratio%_struct'] = (concatenated_df['structure_length']
                                        / concatenated_df['gene_length']) * 100

    # Min-max normalization of the ratio onto a 0-10 scale
    min_ratio = concatenated_df['ratio%_struct'].min()
    max_ratio = concatenated_df['ratio%_struct'].max()
    concatenated_df['norm'] = 10 * ((concatenated_df['ratio%_struct'] - min_ratio)
                                    / (max_ratio - min_ratio))

    # Drop the intermediate columns
    return concatenated_df.drop(columns=['gene_length', 'structure_length'])


def read_community_genes(filename):
    """Map each community name to its list of gene IDs.

    Expects a tab-separated file with a header line, the community name in
    column 1 and a comma-separated list of gene IDs in column 3."""
    dgene = defaultdict(list)
    with open(filename, 'r') as file:
        next(file, None)  # skip the header line
        for line in file:
            fields = line.split("\t")
            for gene_id in fields[2].split(","):
                dgene[fields[0]].append(gene_id.strip())
    return dgene


def comm_filter(str_df, comu_dict):
    """Per-gene table labelled by community, with the top 10% of counts dropped."""
    df_struct = df_maker(str_df)
    dgene = read_community_genes(comu_dict)

    dfs = []
    for community, gene_ids in dgene.items():
        # Copy to avoid modifying the original DataFrame
        kc = df_struct[df_struct['gene_id'].isin(gene_ids)].copy()
        kc['community'] = community
        dfs.append(kc)
    result_df = pd.concat(dfs, ignore_index=True)

    # Drop rows whose count is above the 90th percentile
    threshold = result_df['count'].quantile(0.9)
    return result_df[result_df['count'] <= threshold]


def comm_filter_norm(str_df, comu_dict):
    """Per-gene table labelled by community, without the percentile filtering."""
    df_struct = df_maker(str_df)
    dgene = read_community_genes(comu_dict)

    dfs = []
    for community, gene_ids in dgene.items():
        kc = df_struct[df_struct['gene_id'].isin(gene_ids)].copy()
        kc['community'] = community
        dfs.append(kc)
    return pd.concat(dfs, ignore_index=True)
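
# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; both paths are hypothetical
# and must point at a real bedtools-intersect output and community table):
if __name__ == "__main__":
    intersect_bed = "intersect/GQ.bed"      # 10- or 12-column intersect output
    community_txt = "community_SPIN.txt"    # tab-separated, with a header line
    per_gene = comm_filter(intersect_bed, community_txt)
    print(per_gene.head())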