Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hic
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
Hub
hic
Commits
49ad90d5
Verified
Commit
49ad90d5
authored
1 year ago
by
Mia Croiset
Browse files
Options
Downloads
Patches
Plain Diff
update build matrices function + delete manual sorting
parent
bf23c27b
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
bin/hicstuff_build_matrix.py
+23
-157
23 additions, 157 deletions
bin/hicstuff_build_matrix.py
modules/local/hicstuff/build_matrix.nf
+1
-1
1 addition, 1 deletion
modules/local/hicstuff/build_matrix.nf
with
24 additions
and
158 deletions
bin/hicstuff_build_matrix.py
+
23
−
157
View file @
49ad90d5
...
...
@@ -12,144 +12,7 @@ import hicstuff.io as hio
import
hicstuff.digest
as
hcd
from
Bio
import
SeqIO
import
hicstuff.log
as
hcl
def
pairs2matrix
(
pairs_file
,
mat_file
,
fragments_file
,
mat_fmt
=
"
graal
"
,
threads
=
1
,
tmp_dir
=
None
):
"""
Generate the matrix by counting the number of occurences of each
combination of restriction fragments in a pairs file.
Parameters
----------
pairs_file : str
Path to a Hi-C pairs file, with frag1 and frag2 columns added.
mat_file : str
Path where the matrix will be written.
fragments_file : str
Path to the fragments_list.txt file. Used to know total
matrix size in case some observations are not observed at the end.
mat_fmt : str
The format to use when writing the matrix. Can be graal or bg2 format.
threads : int
Number of threads to use in parallel.
tmp_dir : str
Temporary directory for sorting files. If None given, will use the system default.
"""
# Number of fragments is N lines in frag list - 1 for the header
n_frags
=
sum
(
1
for
line
in
open
(
fragments_file
,
"
r
"
))
-
1
frags
=
pd
.
read_csv
(
fragments_file
,
delimiter
=
"
\t
"
)
def
write_mat_entry
(
frag1
,
frag2
,
contacts
):
"""
Write a single sparse matrix entry in either graal or bg2 format
"""
if
mat_fmt
==
"
graal
"
:
mat
.
write
(
"
\t
"
.
join
(
map
(
str
,
[
frag1
,
frag2
,
n_occ
]))
+
"
\n
"
)
elif
mat_fmt
==
"
bg2
"
:
frag1
,
frag2
=
int
(
frag1
),
int
(
frag2
)
mat
.
write
(
"
\t
"
.
join
(
map
(
str
,
[
frags
.
chrom
[
frag1
],
frags
.
start_pos
[
frag1
],
frags
.
end_pos
[
frag1
],
frags
.
chrom
[
frag2
],
frags
.
start_pos
[
frag2
],
frags
.
end_pos
[
frag2
],
contacts
,
],
)
)
+
"
\n
"
)
pre_mat_file
=
mat_file
+
"
.pre.pairs
"
# hio.sort_pairs(
# pairs_file,
# pre_mat_file,
# keys=["frag1", "frag2"],
# threads=threads,
# tmp_dir=tmp_dir,
# )
header_size
=
len
(
hio
.
get_pairs_header
(
pre_mat_file
))
with
open
(
pre_mat_file
,
"
r
"
)
as
pairs
,
open
(
mat_file
,
"
w
"
)
as
mat
:
# Skip header lines
for
_
in
range
(
header_size
):
next
(
pairs
)
prev_pair
=
[
"
0
"
,
"
0
"
]
# Pairs identified by [frag1, frag2]
n_occ
=
0
# Number of occurences of each frag combination
n_nonzero
=
0
# Total number of nonzero matrix entries
n_pairs
=
0
# Total number of pairs entered in the matrix
pairs_reader
=
csv
.
reader
(
pairs
,
delimiter
=
"
\t
"
)
# First line contains nrows, ncols and number of nonzero entries.
# Number of nonzero entries is unknown for now
if
mat_fmt
==
"
graal
"
:
mat
.
write
(
"
\t
"
.
join
(
map
(
str
,
[
n_frags
,
n_frags
,
"
-
"
]))
+
"
\n
"
)
for
pair
in
pairs_reader
:
# Fragment ids are field 8 and 9
curr_pair
=
[
pair
[
7
],
pair
[
8
]]
# Increment number of occurences for fragment pair
if
prev_pair
==
curr_pair
:
n_occ
+=
1
# Write previous pair and start a new one
else
:
if
n_occ
>
0
:
write_mat_entry
(
prev_pair
[
0
],
prev_pair
[
1
],
n_occ
)
prev_pair
=
curr_pair
n_pairs
+=
n_occ
n_occ
=
1
n_nonzero
+=
1
# Write the last value
write_mat_entry
(
curr_pair
[
0
],
curr_pair
[
1
],
n_occ
)
n_nonzero
+=
1
n_pairs
+=
1
# Edit header line to fill number of nonzero entries inplace in graal header
if
mat_fmt
==
"
graal
"
:
with
open
(
mat_file
)
as
mat
,
open
(
pre_mat_file
,
"
w
"
)
as
tmp_mat
:
header
=
mat
.
readline
()
header
=
header
.
replace
(
"
-
"
,
str
(
n_nonzero
))
tmp_mat
.
write
(
header
)
st
.
copyfileobj
(
mat
,
tmp_mat
)
# Replace the matrix file with the one with corrected header
os
.
rename
(
pre_mat_file
,
mat_file
)
else
:
os
.
remove
(
pre_mat_file
)
logger
.
info
(
"
%d pairs used to build a contact map of %d bins with %d nonzero entries.
"
,
n_pairs
,
n_frags
,
n_nonzero
,
)
def
pairs2cool
(
pairs_file
,
cool_file
,
bins_file
):
"""
Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
bins_file : str
Path to the file containing genomic segmentation information. (fragments_list.txt).
"""
# Make bins file compatible with cooler cload
bins_tmp
=
bins_file
+
"
.cooler
"
bins
=
pd
.
read_csv
(
bins_file
,
sep
=
"
\t
"
,
usecols
=
[
1
,
2
,
3
],
skiprows
=
1
,
header
=
None
)
bins
.
to_csv
(
bins_tmp
,
sep
=
"
\t
"
,
header
=
False
,
index
=
False
)
cooler_cmd
=
"
cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}
"
cool_args
=
{
"
bins
"
:
bins_tmp
,
"
pairs
"
:
pairs_file
,
"
cool
"
:
cool_file
}
sp
.
call
(
cooler_cmd
.
format
(
**
cool_args
),
shell
=
True
)
os
.
remove
(
bins_tmp
)
import
hicstuff.pipeline
as
hp
if
__name__
==
"
__main__
"
:
parser
=
argparse
.
ArgumentParser
()
...
...
@@ -161,37 +24,40 @@ if __name__ == "__main__":
pairs
=
args
.
pairs
fragments_list
=
args
.
fragments
mat_f
orma
t
=
args
.
type
mat_f
m
t
=
args
.
type
mat_out
=
args
.
output
log_file
=
"
hicstuff_matrix.log
"
sys
.
stderr
=
open
(
"
hicstuff_matrix.log
"
,
"
wt
"
)
hcl
.
set_file_handler
(
log_file
)
# Log which pairs file is being used and how many pairs are listed
pairs_count
=
0
with
open
(
pairs
,
"
r
"
)
as
file
:
for
line
in
file
:
if
line
.
startswith
(
'
#
'
):
continue
else
:
pairs_count
+=
1
logger
.
info
(
"
Generating matrix from pairs file %s (%d pairs in the file)
"
,
pairs
,
pairs_count
)
# Build matrix from pairs.
if
mat_fmt
==
"
cool
"
:
# Log which pairs file is being used and how many pairs are listed
pairs_count
=
0
with
open
(
pairs
,
"
r
"
)
as
file
:
for
line
in
file
:
if
line
.
startswith
(
'
#
'
):
continue
else
:
pairs_count
+=
1
logger
.
info
(
"
Generating matrix from pairs file %s (%d pairs in the file)
"
,
pairs
,
pairs_count
)
if
mat_format
==
"
cool
"
:
# Name matrix file in .cool
cool_file
=
os
.
path
.
splitext
(
mat_out
)[
0
]
+
"
.cool
"
pairs2cool
(
pairs
,
cool_file
,
fragments_list
)
mat
=
os
.
path
.
splitext
(
mat_out
)[
0
]
+
"
.cool
"
hp
.
pairs2cool
(
pairs
,
mat
,
fragments_list
,
exclude
=
None
)
else
:
pairs2matrix
(
hp
.
pairs2matrix
(
pairs
,
mat_out
,
fragments_list
,
mat_fmt
=
mat_f
orma
t
,
mat_fmt
=
mat_f
m
t
,
threads
=
1
,
tmp_dir
=
None
,
)
This diff is collapsed.
Click to expand it.
modules/local/hicstuff/build_matrix.nf
+
1
−
1
View file @
49ad90d5
...
...
@@ -20,7 +20,7 @@ process BUILD_MATRIX {
"""
grep -e ^# ${idx_pairs} > ${args}.pre.pairs
grep -v ^# ${idx_pairs} | sort -S 2G -k8,8n -k9,9n --parallel=1 >> ${args}.pre.pairs
hicstuff_build_matrix.py -p ${
args}.pre.
pairs -f ${fragments_list} -t graal -o ${args}
hicstuff_build_matrix.py -p ${
idx_
pairs
}
-f ${fragments_list} -t graal -o ${args}
pairtools sort ${idx_pairs} --output ${idx_pairs}.gz --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt frag1
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment