// HiC-Pro // Copyright 2015 Institut Curie // Author(s): Eric Viara // Contact: nicolas.servant@curie.fr // This software is distributed without any guarantee under the terms of the BSD-3 License #include <iostream> #include <iomanip> #include <fstream> #include <sstream> #include <unordered_map> #include <map> #include <vector> #include <stdlib.h> #include <string.h> #include <assert.h> #include <math.h> #include <fcntl.h> #include <unistd.h> #include <sys/stat.h> static const int SPARSE_FMT = 0x1; static const int BED_FMT = 0x2; static const char* prog; static bool progress = false; static bool detail_progress = false; static bool quiet = false; static bool NO_DICHO = getenv("NO_DICHO") != NULL; typedef unsigned int chrsize_t; const std::string VERSION = "1.2 [2015-10-20]"; const static chrsize_t BIN_NOT_FOUND = (chrsize_t)-1; class AxisChromosome; static bool is_empty_line(const char* buffer) { while (char c = *buffer++) { if (c != ' ' || c != '\n' || c != '\t') { return false; } } return true; } static int bed_line_parse(char* buffer, char chr[], chrsize_t& start, chrsize_t& end, const std::string& bedfile, size_t line_num) { if (sscanf(buffer, "%s %u %u", chr, &start, &end) != 3) { std::cerr << "bed file \"" << bedfile << "\" at line #" << line_num << " format error\n"; return 1; } return 0; } struct Interval { chrsize_t start; chrsize_t end; Interval(chrsize_t start = 0, chrsize_t end = 0) : start(start), end(end) { } }; class ChrRegions { std::vector<std::string> chr_v; std::map<std::string, std::vector<Interval>* > intervals; public: ChrRegions() { } int readBedfile(const std::string& bedfile) { std::ifstream ifs(bedfile.c_str()); if (ifs.bad() || ifs.fail()) { std::cerr << prog << " cannot open bed file: " << bedfile << " for reading\n"; return 1; } char buffer[4096]; size_t line_num = 0; chrsize_t lastend = 0; char lastchr[2048] = {0}; while (!ifs.eof()) { ifs.getline(buffer, sizeof(buffer)-1); line_num++; if (is_empty_line(buffer)) { continue; } chrsize_t start = 0; chrsize_t end = 0; char chr[2048]; if (bed_line_parse(buffer, chr, start, end, bedfile, line_num)) { return 1; } if (intervals.find(chr) == intervals.end()) { intervals[chr] = new std::vector<Interval>(); chr_v.push_back(chr); } /* if (lastend != 0 && !strcmp(lastchr, chr) && start != lastend) { std::cerr << "warning: discontinuous segment for chromosome " << chr << " at position " << start << " " << end << std::endl; } */ if (*lastchr && strcmp(lastchr, chr)) { lastend = 0; } if (lastend != 0 && start < lastend) { std::cerr << "error: bedfile not sorted at line #" << line_num << std::endl; exit(1); } strcpy(lastchr, chr); lastend = end; intervals[chr]->push_back(Interval(start, end)); if (progress && (line_num % 100000) == 0) { std::cerr << '.' << std::flush; } } if (progress) { std::cerr << std::endl; } return 0; } void displayBed(std::ostream& ofs, const std::vector<AxisChromosome*>& axis_chr) const { std::vector<std::string>::const_iterator begin = chr_v.begin(); std::vector<std::string>::const_iterator end = chr_v.end(); unsigned int num = 1; while (begin != end) { const std::string& chrname = *begin; std::map<std::string, std::vector<Interval>* >::const_iterator iter = intervals.find(chrname); assert(iter != intervals.end()); const std::vector<Interval>* itv_vect = (*iter).second; std::vector<Interval>::const_iterator itv_begin = itv_vect->begin(); std::vector<Interval>::const_iterator itv_end = itv_vect->end(); while (itv_begin != itv_end) { const Interval& itv = (*itv_begin); ofs << chrname << '\t' << itv.start << '\t' << itv.end << '\t' << num << '\n'; if (progress && (num % 100000) == 0) { std::cerr << '.' << std::flush; } num++; ++itv_begin; } ++begin; } if (progress) { std::cerr << std::endl; } } const std::vector<Interval>* getIntervalsFromChr(const std::string& chr) const { std::map<std::string, std::vector<Interval>* >::const_iterator iter = intervals.find(chr); if (iter != intervals.end()) { return (*iter).second; } return NULL; } }; class Dichotomic { int min, max; const std::vector<Interval>& intervals; public: Dichotomic(const std::vector<Interval>& intervals) : intervals(intervals) { //min = middle(intervals[0]); //max = middle(intervals[intervals.size()-1]); min = 0; max = intervals.size()-1; } static chrsize_t middle(const Interval& itv) { return (itv.start+1 + itv.end) / 2; } int find(chrsize_t value) { int l = min; int r = max; int n = 0; while (l <= r) { n = (l + r) >> 1; const Interval& itv = intervals[n]; if (value >= itv.start+1 && value <= itv.end) { return n; } int x = middle(itv) - value; if (x < 0) { l = n + 1; } else { r = n - 1; } //std::cout << "l: " << l << '\n'; //std::cout << "r: " << r << '\n'; } return -1; } }; class Chromosome { private: static std::unordered_map<std::string, Chromosome*> chr_map; void computeSizes(chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions); std::string name; chrsize_t chrsize; chrsize_t binsize; chrsize_t stepsize; chrsize_t bincount; const ChrRegions* chr_regions; public: Chromosome(const std::string& name, chrsize_t chrsize, chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions) : name(name), chrsize(chrsize), chr_regions(chr_regions) { computeSizes(ori_binsize, step, binadjust, chr_regions); assert(chr_map.find(name) == chr_map.end()); chr_map[name] = this; } void adjustBinsize(chrsize_t ori_binsize, const chrsize_t step); const std::string& getName() const {return name;} chrsize_t getChrsize() const {return chrsize;} chrsize_t getBinsize() const {return binsize;} chrsize_t getStepsize() const {return stepsize;} chrsize_t getBincount() const {return bincount;} const ChrRegions* getChrRegions() const {return chr_regions;} static chrsize_t getCount() { return chr_map.size(); } static Chromosome* getByName(const std::string& name) { return chr_map[name]; } }; class AxisChromosome { int idx; // really needed ? const Chromosome* chr; chrsize_t binstart; chrsize_t binend; public: AxisChromosome(int binoffset, const Chromosome* chr, const AxisChromosome* lastAxisChr) : chr(chr) { if (lastAxisChr != NULL) { binstart = lastAxisChr->getBinend(); } else { binstart = binoffset; } binend = binstart + chr->getBincount(); /* if (verbose) { std::cerr << "AxisChromosome: " << chr->getName() << " " << binstart << " " << binend << " " << chr->getBincount() << std::endl; } */ } chrsize_t getBinstart() const {return binstart;} chrsize_t getBinend() const {return binend;} chrsize_t getChrsize() const {return chr->getChrsize();} chrsize_t getBinsize() const {return chr->getBinsize();} chrsize_t getStepsize() const {return chr->getStepsize();} chrsize_t getBincount() const {return chr->getBincount();} const Chromosome* getChromosome() const {return chr;} chrsize_t assign_bin(const std::string& org, chrsize_t start) const { const ChrRegions* chr_regions = chr->getChrRegions(); if (chr_regions != NULL) { const std::vector<Interval>* intervals = chr_regions->getIntervalsFromChr(chr->getName()); assert(intervals != NULL); if (!NO_DICHO) { Dichotomic dicho(*intervals); int where = dicho.find(start); if (where < 0) { if (!quiet) { std::cerr << "warning: no bin at position " << chr->getName() << ":" << start << std::endl; } return BIN_NOT_FOUND; } return where + getBinstart(); } std::vector<Interval>::const_iterator begin = intervals->begin(); std::vector<Interval>::const_iterator end = intervals->end(); chrsize_t binidx = 1; while (begin != end) { const Interval& itv = *begin; if (start >= itv.start+1 && start <= itv.end) { break; } ++binidx; ++begin; } return binidx + getBinstart() - 1; } int loc = (int)start; int binsize = getBinsize(); int stepsize = getStepsize(); int cur_binidx = 1 + ceil((double)(loc-binsize)/stepsize); int cur_binbeg = stepsize * (cur_binidx-1)+1; int cur_binend = cur_binbeg + binsize-1; int chrsize = getChrsize(); if (cur_binend > chrsize) { cur_binend = chrsize; } return cur_binidx + getBinstart() - 1; } }; class Matrix { std::vector<AxisChromosome*> axis_chr_abs; std::vector<AxisChromosome*> axis_chr_ord; std::unordered_map<std::string, AxisChromosome*> axis_chr_abs_map; std::unordered_map<std::string, AxisChromosome*> axis_chr_ord_map; std::map<chrsize_t, std::map<chrsize_t, chrsize_t> > mat; void addAxisChromosome(const std::vector<const Chromosome*>& chr_v, std::vector<AxisChromosome*>& axis_chr, std::unordered_map<std::string, AxisChromosome*>& axis_chr_map); const AxisChromosome* getAxisChromosome(const std::string& chrname, const std::unordered_map<std::string, AxisChromosome*>& axis_chr_map) const { std::unordered_map<std::string, AxisChromosome*>::const_iterator iter = axis_chr_map.find(chrname); if (iter == axis_chr_map.end()) { return NULL; } return (*iter).second; } void displayBed(std::ostream& ofs, const std::vector<AxisChromosome*>& axis_chr) const { std::vector<AxisChromosome*>::const_iterator begin = axis_chr.begin(); std::vector<AxisChromosome*>::const_iterator end = axis_chr.end(); while (begin != end) { const AxisChromosome* axis_chr = *begin; const std::string& name = axis_chr->getChromosome()->getName(); chrsize_t binstart = axis_chr->getBinstart(); chrsize_t binend = axis_chr->getBinend(); chrsize_t binsize = axis_chr->getBinsize(); chrsize_t chrsize = axis_chr->getChrsize(); binend -= binstart; for (chrsize_t bin = 0; bin < binend; ++bin) { // bed are 0-based begin, 1-based end chrsize_t beg = bin * binsize; chrsize_t end = beg + binsize - 1; if (end > chrsize) { end = chrsize-1; } ofs << name << '\t' << beg << '\t' << (end+1) << '\t' << (bin+binstart) << '\n'; } ++begin; } } int binoffset; public: Matrix(int binoffset) : binoffset(binoffset) {} void addXAxisChromosome(const std::vector<const Chromosome*>& chr_v); void addYAxisChromosome(const std::vector<const Chromosome*>& chr_v); const AxisChromosome* getXAxisChromosome(const std::string& chrname) const { return getAxisChromosome(chrname, axis_chr_abs_map); } const AxisChromosome* getYAxisChromosome(const std::string& chrname) const { return getAxisChromosome(chrname, axis_chr_ord_map); } void add(chrsize_t abs_bin, chrsize_t ord_bin) { std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::iterator iter = mat.find(abs_bin); if (iter == mat.end()) { mat[abs_bin] = std::map<chrsize_t, chrsize_t>(); mat[abs_bin][ord_bin] = 1; } else { (*iter).second[ord_bin]++; } } void displayMatrix(std::ostream& ofs) const { std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::const_iterator begin = mat.begin(); std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::const_iterator end = mat.end(); size_t line_total = 0; if (progress) { while (begin != end) { const std::map<chrsize_t, chrsize_t>& line = (*begin).second; line_total += line.size(); ++begin; } begin = mat.begin(); } size_t line_cnt = 1; if (progress) { std::cerr << "\n=================\n"; std::cerr << " Dumping matrix\n"; std::cerr << "=================\n\n"; } size_t modulo = line_total / 1000; while (begin != end) { chrsize_t abs = (*begin).first; const std::map<chrsize_t, chrsize_t>& line = (*begin).second; std::map<chrsize_t, chrsize_t>::const_iterator bb = line.begin(); std::map<chrsize_t, chrsize_t>::const_iterator ee = line.end(); while (bb != ee) { if (progress && (line_cnt % modulo) == 0) { double percent = (double(line_cnt)/line_total)*100; std::cerr << "" << percent << "% " << line_cnt << " / " << line_total << std::endl; } ofs << abs << '\t' << (*bb).first << '\t' << (*bb).second << '\n'; line_cnt++; ++bb; } ++begin; } } void displayXBed(std::ostream& ofs) const { displayBed(ofs, axis_chr_abs); } void displayYBed(std::ostream& ofs) const { displayBed(ofs, axis_chr_ord); } const std::vector<AxisChromosome*>& getXAxisChromosomes() {return axis_chr_abs;} const std::vector<AxisChromosome*>& getYAxisChromosomes() {return axis_chr_ord;} }; void Matrix::addAxisChromosome(const std::vector<const Chromosome*>& chr_v, std::vector<AxisChromosome*>& axis_chr, std::unordered_map<std::string, AxisChromosome*>& axis_chr_map) { std::vector<const Chromosome*>::const_iterator begin = chr_v.begin(); std::vector<const Chromosome*>::const_iterator end = chr_v.end(); const AxisChromosome* lastAxisChr = NULL; while (begin != end) { const Chromosome* chr = *begin; AxisChromosome* axisChr = new AxisChromosome(binoffset, chr, lastAxisChr); axis_chr.push_back(axisChr); axis_chr_map[chr->getName()] = axisChr; lastAxisChr = axisChr; ++begin; } } void Matrix::addXAxisChromosome(const std::vector<const Chromosome*>& chr_v) { addAxisChromosome(chr_v, axis_chr_abs, axis_chr_abs_map); } void Matrix::addYAxisChromosome(const std::vector<const Chromosome*>& chr_v) { addAxisChromosome(chr_v, axis_chr_ord, axis_chr_ord_map); } std::unordered_map<std::string, Chromosome*> Chromosome::chr_map; enum Format { SPARSE_IND_FMT = SPARSE_FMT, SPARSE_BED_FMT = SPARSE_FMT|BED_FMT, EXPANDED_FMT = 0x4 }; void Chromosome::adjustBinsize(chrsize_t ori_binsize, const chrsize_t step) { bincount = 1 + (chrsize_t)floor( (double)(chrsize-ori_binsize) / (ori_binsize/step)); binsize = chrsize / bincount; stepsize = binsize / step; } void Chromosome::computeSizes(chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions) { if (NULL != chr_regions) { const std::vector<Interval>* intervals = chr_regions->getIntervalsFromChr(name); assert(intervals != NULL); bincount = intervals->size(); /* if (verbose) { std::cerr << name << " bincount: " << bincount << std::endl; } */ } else { if (chrsize < ori_binsize) { binsize = chrsize; stepsize = chrsize; bincount = 1; } else if (binadjust) { adjustBinsize(ori_binsize, step); } else { binsize = ori_binsize; stepsize = (chrsize_t)floor(ori_binsize/step); chrsize_t remainder = (chrsize - ori_binsize) % stepsize; chrsize_t tmp_bincount = 1 + (chrsize_t)floor(chrsize-ori_binsize)/stepsize; bincount = remainder > 0 ? tmp_bincount+1 : tmp_bincount; } /* if (verbose) { std::cerr << name << " sizes: " << chrsize << " " << binsize << " " << stepsize << " " << bincount << std::endl; } */ } } static int usage(int ret = 1) { std::cerr << "\nusage: " << prog << " --binsize BINSIZE|--binfile --chrsizes FILE --ifile FILE\n"; std::cerr << " --oprefix PREFIX [--binadjust] [--step STEP] [--binoffset OFFSET]\n"; std::cerr << " [--matrix-format asis|upper|lower|complete][--chrA CHR... --chrB CHR...] [--quiet] [--progress] [--detail-progress]\n"; std::cerr << "\nusage: " << prog << " --version\n"; std::cerr << "\nusage: " << prog << " --help\n"; return ret; } static int help() { (void)usage(); std::cerr << "\nOPTIONS\n\n"; std::cerr << " --version : display version\n"; std::cerr << " --binsize BINSIZE : bin size\n"; std::cerr << " --binfile BEDFILE : bed file containing bins (chr start end)\n"; std::cerr << " --chrsizes FILE : file containing chromosome sizes\n"; std::cerr << " --ifile FILE : input interaction file\n"; std::cerr << " --oprefix PREFIX : output prefix of generated files (matrix and bed)\n"; std::cerr << " --binadjust : [optional] adjust bin sizes, default is false\n"; std::cerr << " --step STEP : [optional] step size, default is 1\n"; std::cerr << " --binoffset OFFSET : [optional] starting bin offset, default is 1\n"; std::cerr << " --matrix-format FORMAT : [optional] FORMAT may be:\n"; std::cerr << " - asis: matrix is generated according to input data (default)\n"; std::cerr << " - upper: only the upper matrix is generated\n"; std::cerr << " - lower: only the lower matrix is generated\n"; std::cerr << " - complete: generate both parts of the matrix (upper and lower);\n"; std::cerr << " input data must contain only one part (upper or lower) \n"; std::cerr << " --chrA CHR : [optional] colon separated list of abscissa chromosomes; default is all chromosomes\n"; std::cerr << " --chrB CHR : [optional] colon separated list of ordinate chromosomes; default is all chromosomes\n"; std::cerr << " --quiet : do not display any warning\n"; std::cerr << " --progress : display progress\n"; std::cerr << " --detail-progress : display detail progress (needs preliminary steps consuming time)\n"; return -1; } enum MatrixFormat { ASIS_MATRIX = 1, UPPER_MATRIX, LOWER_MATRIX, COMPLETE_MATRIX }; static int get_options(int argc, char* argv[], chrsize_t& binsize, const char*& binfile, const char*& chrsize_file, const char*& ifile, const char*& oprefix, Format& format, std::string& bed_prefix, bool& binadjust, MatrixFormat& matrix_format, chrsize_t& step, bool& whole_genome, int& binoffset, const char*& chrA, const char*& chrB) { prog = argv[0]; for (int ac = 1; ac < argc; ++ac) { const char* opt = argv[ac]; if (*opt == '-') { if (!strcmp(opt, "--binadjust")) { binadjust = true; } else if (!strcmp(opt, "--version")) { std::cout << "build_matrix version " << VERSION << "\n"; exit(0); } else if (!strcmp(opt, "--progress")) { progress = true; } else if (!strcmp(opt, "--quiet")) { quiet = true; } else if (!strcmp(opt, "--detail-progress")) { progress = true; detail_progress = true; } else if (!strcmp(opt, "--matrix-format")) { if (ac == argc-1) { return usage(); } std::string matrix_format_str = argv[++ac]; if (matrix_format_str == "asis") { matrix_format = ASIS_MATRIX; } else if (matrix_format_str == "upper") { matrix_format = UPPER_MATRIX; } else if (matrix_format_str == "lower") { matrix_format = LOWER_MATRIX; } else if (matrix_format_str == "complete") { matrix_format = COMPLETE_MATRIX; } else { return usage(); } } else if (!strcmp(opt, "--step")) { if (ac == argc-1) { return usage(); } step = atoi(argv[++ac]); } else if (!strcmp(opt, "--binfile")) { if (ac == argc-1) { return usage(); } binfile = argv[++ac]; } else if (!strcmp(opt, "--binsize")) { if (ac == argc-1) { return usage(); } binsize = atoi(argv[++ac]); } else if (!strcmp(opt, "--binoffset")) { if (ac == argc-1) { return usage(); } binoffset = atoi(argv[++ac]); } else if (!strcmp(opt, "--ifile")) { if (ac == argc-1) { return usage(); } ifile = argv[++ac]; } else if (!strcmp(opt, "--oprefix")) { if (ac == argc-1) { return usage(); } oprefix = argv[++ac]; } else if (!strcmp(opt, "--chrsizes")) { if (ac == argc-1) { return usage(); } chrsize_file = argv[++ac]; } else if (!strcmp(opt, "--chrA")) { if (ac == argc-1) { return usage(); } chrA = argv[++ac]; whole_genome = false; } else if (!strcmp(opt, "--chrB")) { if (ac == argc-1) { return usage(); } chrB = argv[++ac]; whole_genome = false; } else if (!strcmp(opt, "--help")) { return help(); } else { std::cerr << '\n' << prog << ": unknown option " << opt << std::endl; return usage(); } } } return 0; } static void split_in_vect(const std::string& str, std::vector<const Chromosome*>& vect) { size_t last_pos = 0; while (size_t pos = str.find(':', last_pos)) { std::string chrname; bool last = pos == std::string::npos; if (last) { chrname = str.substr(last_pos); } else { chrname = str.substr(last_pos, pos-last_pos); } const Chromosome* chr = Chromosome::getByName(chrname); if (!chr) { std::cerr << prog << ": unknown chromosome " << chrname << std::endl; exit(1); } vect.push_back(chr); if (last) { break; } last_pos = pos+1; } } static int interaction_parse(char* buffer, char*& lchr, chrsize_t& lstart, char*& rchr, chrsize_t& rstart) { char c; char* str; while ((c = *buffer++) != 0) { if (c == '\t') { lchr = buffer; break; } } while ((c = *buffer) != 0) { if (c == '\t') { *buffer++ = 0; str = buffer; break; } buffer++; } while ((c = *buffer) != 0) { if (c == '\t') { *buffer++ = 0; lstart = atoi(str); break; } buffer++; } while ((c = *buffer++) != 0) { if (c == '\t') { rchr = buffer; break; } } while ((c = *buffer) != 0) { if (c == '\t') { *buffer++ = 0; str = buffer; break; } buffer++; } while ((c = *buffer) != 0) { if (c == '\t') { *buffer++ = 0; rstart = atoi(str); break; } buffer++; } return 0; } static char p_buffer[512000]; static int build_matrix_init(Matrix& matrix, const char* ifile, std::ifstream& ifs, const std::string& oprefix, std::ofstream& matfs, std::ofstream& xbedfs, std::ofstream& ybedfs, const char* chrsize_file, bool whole_genome, const char* chrA, const char* chrB, chrsize_t ori_binsize, const char* binfile, chrsize_t step, bool binadjust, ChrRegions*& chr_regions, size_t& line_total) { ifs.open(ifile); if (ifs.bad() || ifs.fail()) { std::cerr << prog << " cannot open interaction file: " << ifile << " for reading\n"; return 1; } if (detail_progress) { if (progress) { std::cerr << "\n======================================\n"; std::cerr << " Getting information for progress bar\n"; std::cerr << "======================================\n\n"; } std::cerr << std::setprecision(2) << std::fixed; int fd = open(ifile, O_RDONLY); struct stat st; assert(fstat(fd, &st) == 0); assert(fd >= 0); int nn; int cnt = 1; while ((nn = read(fd, p_buffer, sizeof(p_buffer))) > 0) { const char *p = p_buffer; while (nn-- > 0) { if (*p++ == '\n') { line_total++; } } if ((cnt % 200) == 0) { std::cerr << '.' << std::flush; } cnt++; } std::cerr << std::endl; close(fd); } std::ifstream chrsizefs; chrsizefs.open(chrsize_file); if (chrsizefs.bad() || chrsizefs.fail()) { std::cerr << prog << " cannot open chrsizes file: " << chrsize_file << " for reading\n"; return 1; } std::string matfile = oprefix + ".matrix"; matfs.open(matfile); if (matfs.bad() || matfs.fail()) { std::cerr << prog << " cannot open file: " << matfile << " for writing\n"; return 1; } std::string xbedfile = oprefix + "_abs.bed"; xbedfs.open(xbedfile); if (xbedfs.bad() || xbedfs.fail()) { std::cerr << prog << " cannot open file: " << xbedfile << " for writing\n"; return 1; } std::string ybedfile = oprefix + "_ord.bed"; if (!whole_genome) { //std::string xbedlink; //size_t pos = xbedfile.rfind('/'); //if (pos != std::string::npos) { // xbedlink = xbedfile.substr(pos+1); //} else { // xbedlink = xbedfile; //} //unlink(ybedfile.c_str()); //if (symlink(xbedlink.c_str(), ybedfile.c_str())) { // std::cerr << prog << " cannot created link: " << ybedfile << "\n"; // return 1; //} //} else { ybedfs.open(ybedfile); if (ybedfs.bad() || ybedfs.fail()) { std::cerr << prog << " cannot open file: " << ybedfile << " for writing\n"; return 1; } } chr_regions = NULL; if (NULL != binfile) { chr_regions = new ChrRegions(); if (progress) { std::cerr << "\n=================\n"; std::cerr << " Reading binfile\n"; std::cerr << "=================\n\n"; } if (chr_regions->readBedfile(binfile)) { return 1; } } std::vector<const Chromosome*> all_chr_v; while (!chrsizefs.eof()) { std::string buffer; getline(chrsizefs, buffer); chrsize_t chrsize; std::istringstream istr(buffer); std::string name; istr >> name >> chrsize; if (!istr.fail()) { Chromosome* chromosome = new Chromosome(name, chrsize, ori_binsize, step, binadjust, chr_regions); all_chr_v.push_back(chromosome); } } chrsizefs.close(); if (chrA) { assert(chrB != NULL); std::vector<const Chromosome*> chrA_v; std::vector<const Chromosome*> chrB_v; split_in_vect(chrA, chrA_v); split_in_vect(chrB, chrB_v); matrix.addXAxisChromosome(chrA_v); matrix.addYAxisChromosome(chrB_v); } else { matrix.addXAxisChromosome(all_chr_v); matrix.addYAxisChromosome(all_chr_v); } return 0; } static int build_matrix(int binoffset, chrsize_t ori_binsize, const char* binfile, const char* chrsize_file, const char* ifile, const char* oprefix, Format _dummy_format, const std::string& _dummy_bed_prefix, bool binadjust, MatrixFormat matrix_format, chrsize_t step, bool whole_genome, const char* chrA, const char* chrB) { std::ifstream ifs; std::ofstream matfs, xbedfs, ybedfs; Matrix matrix(binoffset); ChrRegions *chr_regions = NULL; size_t line_total = 0; if (int ret = build_matrix_init(matrix, ifile, ifs, oprefix, matfs, xbedfs, ybedfs, chrsize_file, whole_genome, chrA, chrB, ori_binsize, binfile, step, binadjust, chr_regions, line_total)) { return ret; } if (progress) { std::cerr << "\n=================\n"; std::cerr << " Building matrix\n"; std::cerr << "=================\n\n"; } size_t line_cnt = 1; size_t line_num = 0; char buffer[4096]; std::string lmark, rmark, lorg, rorg; while (!ifs.eof()) { ifs.getline(buffer, sizeof(buffer)-1); line_num++; if (is_empty_line(buffer)) { continue; } chrsize_t lstart = 0; chrsize_t rstart = 0; char* lchr = NULL; char* rchr = NULL; interaction_parse(buffer, lchr, lstart, rchr, rstart); const AxisChromosome* abs_chr = matrix.getXAxisChromosome(lchr); if (!abs_chr) { continue; } const AxisChromosome* ord_chr = matrix.getYAxisChromosome(rchr); if (!ord_chr) { continue; } chrsize_t abs_bin = abs_chr->assign_bin(lorg, lstart); if (abs_bin == BIN_NOT_FOUND) { continue; } chrsize_t ord_bin = ord_chr->assign_bin(rorg, rstart); if (ord_bin == BIN_NOT_FOUND) { continue; } switch(matrix_format) { case ASIS_MATRIX: matrix.add(abs_bin, ord_bin); break; case UPPER_MATRIX: if (abs_bin < ord_bin) { matrix.add(abs_bin, ord_bin); } else { matrix.add(ord_bin, abs_bin); } break; case LOWER_MATRIX: if (abs_bin > ord_bin) { matrix.add(abs_bin, ord_bin); } else { matrix.add(ord_bin, abs_bin); } break; case COMPLETE_MATRIX: matrix.add(abs_bin, ord_bin); if (abs_bin != ord_bin) { matrix.add(ord_bin, abs_bin); } break; } line_cnt++; if (progress && (line_cnt % 100000) == 0) { if (detail_progress) { double percent = (double(line_cnt)/line_total)*100; std::cerr << "" << percent << "% " << line_cnt << " / " << line_total << std::endl; } else { std::cerr << line_cnt << std::endl; } } } if (progress) { std::cerr << "\n==================\n"; std::cerr << " Dumping bedfiles\n"; std::cerr << "==================\n\n"; } if (NULL != chr_regions) { chr_regions->displayBed(xbedfs, matrix.getXAxisChromosomes()); if (!whole_genome) { chr_regions->displayBed(ybedfs, matrix.getYAxisChromosomes()); } } else { matrix.displayXBed(xbedfs); if (!whole_genome) { matrix.displayYBed(ybedfs); } } matrix.displayMatrix(matfs); xbedfs.close(); ybedfs.close(); matfs.close(); return 0; } int main(int argc, char* argv[]) { chrsize_t step = 1; bool binadjust = false; MatrixFormat matrix_format = ASIS_MATRIX; chrsize_t binsize = 0; const char* ifile = NULL; const char* oprefix = NULL; const char* chrA = NULL; const char* chrB = NULL; const char* chrsize_file = NULL; const char* binfile = NULL; bool whole_genome = true; int binoffset = 1; std::string bed_prefix; Format format = SPARSE_BED_FMT; if (int ret = get_options(argc, argv, binsize, binfile, chrsize_file, ifile, oprefix, format, bed_prefix, binadjust, matrix_format, step, whole_genome, binoffset, chrA, chrB)) { if (ret < 0) { return 0; } return ret; } if (!binsize && !binfile) { std::cerr << '\n'; std::cerr << prog << ": missing --binsize or --binfile option\n"; return usage(); } if (!chrsize_file) { std::cerr << '\n'; std::cerr << prog << ": missing --chrsizes option\n"; return usage(); } if (!ifile) { std::cerr << '\n'; std::cerr << prog << ": missing --ifile option\n"; return usage(); } if (!oprefix) { std::cerr << '\n'; std::cerr << prog << ": missing --oprefix option\n"; return usage(); } if ((chrA && !chrB) || (!chrA && chrB)) { std::cerr << '\n'; std::cerr << prog << ": options --chrA and --chrB must be set simultanously\n"; return usage(); } if (binfile && binsize) { std::cerr << '\n'; std::cerr << prog << ": options --binfile and --binsize cannot be set simultanously\n"; return usage(); } return build_matrix(binoffset, binsize, binfile, chrsize_file, ifile, oprefix, format, bed_prefix, binadjust, matrix_format, step, whole_genome, chrA, chrB); }