diff --git a/README.md b/README.md index c19ca6fffe88d91b2d5b420dbd9458eddd26365c..2421aa5ef3460b20cccc13cdd90522ff7ec16cd6 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,10 @@ ## Description ## Prerequisites -1. This project is coded in ```python3.8``` and uses the following modules: - * SQLite 3 +1. This project is coded in ```python3.8``` and uses the modules listed in `requirements.txt`. To easily install them with pip, enter in your terminal: +```sh +pip install -r requirements.txt +``` 2. [PmagicGEO](https://gitbio.ens-lyon.fr/LBMC/regards/pmagicgeo) - a script that allows to obtain metadata associated to a GSE or a GSM ([GEO classification](https://www.ncbi.nlm.nih.gov/geo/)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3864ef2f61a3af5e388b7b86d033f9e9adeac38f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +pandas==0.25.3 +biopython==1.75 +tqdm +numpy +regex==2020.2.20 +lazyparser==0.2.0 +coloredlogs==10.0 diff --git a/src/db_utils/create_freq_table.py b/src/db_utils/create_freq_table.py deleted file mode 100755 index a3a8117ce4fd21ecbfc5e9a2ddb06b6389e6dab6..0000000000000000000000000000000000000000 --- a/src/db_utils/create_freq_table.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 - -# -*- coding: utf-8 -*- - -""" -Description: The goal of this script is to create the tables of frequencies \ -that will be next injected into the web database -""" - - -from pathlib import Path -import pandas as pd -from .creation_of_exon_table import get_ctrl_exon -from .config import Config, logging_def -import logging -import sys - - -def load_bed(bed_file: Path) -> pd.DataFrame: - """ - Load a bed file. - - :param bed_file: A bed file containing exons with a 7th column \ - corresponding to frequencies of in the exon. - :return: the bed file as a dataframe - """ - if "orf" in bed_file.name: - mtype = "orf" - else: - mtype = "exon" - names = ["chr", "start", "stop", "name", "score", "strand", "freq"] - df = pd.read_csv(bed_file, sep="\t", names=names) - df = df.loc[:, ["name", "freq"]] - df["type"] = [mtype] * len(df) - df.freq = df.freq.apply(eval) - return df - - -def get_ft_type(bed_type: str, feature: str) -> str: - """ - Get the feature type of a feature. - - :param bed_type: The type of a bed : orf or exon - :param feature: The feature of interest - :return: The feature type - """ - if bed_type == "orf": - if len(feature) == 1: - return "aa" - else: - return "ft" - else: - if len(feature) == 1: - return "nt" - elif len(feature) == 2: - return "dnt" - else: - return "tnt" - - -def create_table(df: pd.DataFrame) -> pd.DataFrame: - """ - Create the table of that that will be inserted in gin database. - - :param df: A dataframe of exon and the aa/ft frequencies \ - of their encoded peptide and their nt/dnt/tnt frequencies - :return: A dataframe - """ - dic = {"ft": [], "frequency": [], "exon_id": [], "ft_type": []} - tot = len(df) - for i in range(len(df)): - if round(i / tot * 100) in range(101): - sys.stdout.write(f"Progression : " - f" {round((i + 1) / (tot / 100))} % \r") - s = df.iloc[i, :] - for ft in s.loc["freq"].keys(): - ft_type = get_ft_type(s.loc["type"], ft) - dic["ft"].append(ft) - dic["frequency"].append(s.loc["freq"][ft]) - dic["exon_id"].append(s.loc["name"]) - dic["ft_type"].append(ft_type) - del df - table = pd.DataFrame(dic) - return table[["ft_type", "ft", "exon_id", "frequency"]].reset_index() - - -def create_freq_table(bed_orf: Path, bed_exon: Path, ctrl_exon: Path, - logging_level: str = "DISABLE") -> None: - """ - From to bed files with as 7th column corresponding to frequency of an \ - exon compute the table that will be used to populate the gin database. - - :param bed_orf: A bed file containing only ORF for exons and \ - a 7th column corresponding to the frequency in amino acid or \ - feature in the peptide coded by the exons - :param bed_exon: A bed file containing exons with a 7th column \ - containing the frequency of nucleotides, di-nucleotides, tri-nucleotides. - :param ctrl_exon: The file containing control exons - :param logging_level: The level of information to display - """ - logging_def(Config.output, logging_level) - logging.debug(f"Loading {bed_orf}") - df_orf = load_bed(bed_orf) - logging.debug(f"Loading {bed_exon}") - df_exon = load_bed(bed_exon) - logging.debug("Concatenating load file and filtering on ctrl exons") - df = pd.concat([df_exon, df_orf], axis=0, ignore_index=True) - del df_exon - del df_orf - ctrl_exon_list = get_ctrl_exon(ctrl_exon) - df = df.loc[df.name.isin(ctrl_exon_list), :] - logging.debug(df.head()) - logging.debug("Creating Frequency table") - final_table = create_table(df) - logging.debug(final_table.head()) - logging.debug(f"Saving table to {Config.frequency_file}") - final_table.columns = ["id", "ft_type", "ft", "exon_id", "frequency"] - final_table.to_csv(Config.frequency_file, sep="\t")