-
Notifications
You must be signed in to change notification settings - Fork 8
/
clusterbuster.py
332 lines (292 loc) · 14.2 KB
/
clusterbuster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
import io
import os
import subprocess
from typing import Dict, Optional, Tuple, Union
import numpy as np
import pandas as pd
def get_motif_id_to_filename_and_nbr_motifs_dict(
motifs_dir: str,
motifs_list_filename: str,
partial: Optional[Tuple[int, int]] = None,
min_nbr_motifs: Optional[int] = 1,
max_nbr_motifs: Optional[int] = None,
motif_md5_to_motif_id_filename: Optional[str] = None,
) -> (Dict[str, str], Dict[str, int]):
"""
Create motif ID to Cluster-Buster motif file and number of motifs per motif file mapping.
:param motifs_dir:
Directory with Cluster-Buster motif files (with motif MD5 name or motif ID motif files).
:param motifs_list_filename:
File with Cluster-Buster motif MD5 names or motif IDs.
:param partial: (current_part, nbr_total_parts)
Divide the motif list in a number of total parts (of similar size) and return only the part defined by
current_part. This makes it easier to create partial cisTarget motifs or tracks vs regions or genes scores
database on machines which do not have enough RAM to score all motifs in one iteration, while still being able
to give the same list of motifs to each instance.
:param min_nbr_motifs:
Only include motif IDs which have at least min_nbr_motifs of motifs in motif ID Cluster-Buster file.
This allows to easily score e.g clustered Cluster-Buster motif files with at least 2 motifs as the singletons
might already be scored before.
Default: 1.
:param max_nbr_motifs:
Only include motif IDs which have at maximum max_nbr_motifs of motifs in motif ID Cluster-Buster file.
Default: None.
:param motif_md5_to_motif_id_filename: TSV file with motif MD5 names to motif IDs mapping (optional).
:return: (motif_id_to_filename_dict, motif_id_to_nbr_motifs_dict):
motif ID to CLuster-Buster motif filename mapping.
motif ID to number of motifs in motif ID file mapping.
"""
motif_id_to_filename_dict = dict()
motif_id_to_nbr_motifs_dict = dict()
motif_md5_to_motif_id_dict = dict()
motif_id_to_motif_md5_dict = dict()
if not min_nbr_motifs:
min_nbr_motifs = 1
if motif_md5_to_motif_id_filename:
# Get motif MD5 name to motif ID mapping if motif_md5_to_motif_id_filename was provided.
with open(motif_md5_to_motif_id_filename, "r") as fh:
for line in fh:
line = line.rstrip()
if line and not line.startswith("#"):
motif_md5, motif_id = line.rstrip().split("\t")[0:2]
# Store motif MD5 name to motif ID mapping and vice versa.
motif_md5_to_motif_id_dict[motif_md5] = motif_id
motif_id_to_motif_md5_dict[motif_id] = motif_md5
# Create motif ID to Cluster-Buster motif filename mapping.
with open(motifs_list_filename, "r") as fh:
for line in fh:
motif_md5_or_id = line.rstrip()
if motif_md5_or_id and not motif_md5_or_id.startswith("#"):
if motif_md5_or_id.endswith(".cb"):
# Remove ".cb" extension from motif MD5 name or motif ID.
motif_md5_or_id = motif_md5_or_id[:-3]
if motif_md5_to_motif_id_dict:
# A motif_md5_to_motif_id_filename was provided, so assume Cluster-Buster motif filenames in
# motifs_dir have motif MD5 names.
if motif_md5_or_id in motif_md5_to_motif_id_dict:
# Get associated motif ID for motif MD5 name if a motif MD5 name was provided.
motif_id = motif_md5_to_motif_id_dict[motif_md5_or_id]
motif_md5 = motif_md5_or_id
elif motif_md5_or_id in motif_id_to_motif_md5_dict:
# Get associated motif MD5 name for motif ID if a motif ID was provided.
motif_id = motif_md5_or_id
motif_md5 = motif_id_to_motif_md5_dict[motif_md5_or_id]
else:
raise ValueError(
f'Error: Could not find motif MD5 name <=> motif ID association for "{motif_md5_or_id}".'
)
# Cluster-Buster motif MD5 name filename.
motif_filename = os.path.join(motifs_dir, motif_md5 + ".cb")
else:
# No motif_md5_to_motif_id_filename was provided, so assume Cluster-Buster motif filenames in
# motifs_dir have motif IDs.
motif_id = motif_md5_or_id
# Cluster-Buster motif ID filename.
motif_filename = os.path.join(motifs_dir, motif_id + ".cb")
if not os.path.exists(motif_filename):
raise OSError(
f'Error: Cluster-Buster motif filename "{motif_filename}" does not exist for motif {motif_id}.'
)
else:
# Number of motifs in current motif ID Cluster-Buster file.
nbr_motifs_for_motif_id = 0
with open(motif_filename, "r") as motif_fh:
# Count number of motifs in one motif file.
for motif_line in motif_fh:
if motif_line.startswith(">"):
nbr_motifs_for_motif_id += 1
# Filter out if motif ID Cluster-Buster file does have too little or too many motifs.
if nbr_motifs_for_motif_id >= min_nbr_motifs:
if max_nbr_motifs:
if nbr_motifs_for_motif_id <= max_nbr_motifs:
motif_id_to_filename_dict[motif_id] = motif_filename
motif_id_to_nbr_motifs_dict[
motif_id
] = nbr_motifs_for_motif_id
else:
motif_id_to_filename_dict[motif_id] = motif_filename
motif_id_to_nbr_motifs_dict[
motif_id
] = nbr_motifs_for_motif_id
# Sort motif IDs by number of motifs in motif ID CLuster-Buster file (high to low) and then by motif ID name, so
# motif IDs with a lot of motifs appear first, so the chance of having a few motif IDs with a lot of motifs only
# running after all other ones are finished, should be much lower.
motif_ids_sorted_by_nbr_motifs_for_motif_id = [
motif_id
for motif_id, nbr_motifs_for_motif_id in sorted(
motif_id_to_nbr_motifs_dict.items(),
key=lambda nbr_motifs_and_motif_id: (
-nbr_motifs_and_motif_id[1],
nbr_motifs_and_motif_id[0],
),
)
]
if partial:
current_part, nbr_total_parts = partial
if nbr_total_parts < 1:
raise ValueError(
f'"nbr_total_parts" ({nbr_total_parts}) of partial argument should be >= 1.'
)
if current_part < 1:
raise ValueError(
f'"current_part" ({current_part}) of partial argument should be >= 1.'
)
if current_part > nbr_total_parts:
raise ValueError(
f'"current_part" ({current_part}) of partial argument should be <= "nbr_total_parts" '
f"({nbr_total_parts})."
)
# Get partial motif IDs list for current requested part of the motif IDs.
# If this function is run with a different current_part number, each partial motif IDs list should have
# a similar number of motifs with a lot of motifs per motif ID Cluster-Buster file (in case a clustered motif
# collection was given).
partial_motif_ids_list = [
motif_ids_sorted_by_nbr_motifs_for_motif_id[i]
for i in range(
current_part - 1, len(motif_id_to_nbr_motifs_dict), nbr_total_parts
)
]
# Recreate dictionaries with subset (corresponding to current_part) of motif IDs sorted by number of motifs in
# motif ID Cluster-Buster file.
motif_id_to_filename_dict = {
motif_id: motif_id_to_filename_dict[motif_id]
for motif_id in partial_motif_ids_list
}
motif_id_to_nbr_motifs_dict = {
motif_id: motif_id_to_nbr_motifs_dict[motif_id]
for motif_id in partial_motif_ids_list
}
else:
# Recreate dictionaries with motif IDs sorted by number of motifs in motif ID Cluster-Buster file (high to low).
motif_id_to_filename_dict = {
motif_id: motif_id_to_filename_dict[motif_id]
for motif_id in motif_ids_sorted_by_nbr_motifs_for_motif_id
}
motif_id_to_nbr_motifs_dict = {
motif_id: motif_id_to_nbr_motifs_dict[motif_id]
for motif_id in motif_ids_sorted_by_nbr_motifs_for_motif_id
}
return motif_id_to_filename_dict, motif_id_to_nbr_motifs_dict
def run_cluster_buster_for_motif(
cluster_buster_path: str,
fasta_filename: str,
motif_filename: str,
motif_id: str,
extract_gene_id_from_region_id_regex_replace: Optional[str] = None,
bg_padding: int = 0,
mask: bool = False,
ssh_command: Optional[Union[str, list]] = None,
) -> Tuple[str, pd.DataFrame]:
"""
Score each sequence in the FASTA file with Cluster-Buster and only keep the top CRM score per region ID/gene ID.
:param cluster_buster_path: Path to Cluster-Buster binary.
:param fasta_filename: FASTA filename with regions to score.
:param motif_filename: Cluster-Buster motif filename which contains the motif to score all regions with.
:param motif_id: Motif ID.
:param extract_gene_id_from_region_id_regex_replace:
Define a regex which will remove the non-gene part of the region ID of each sequence
name in the FASTA file, so only the gene ID remains. If set to None the whole region ID
will be kept instead. In case of region IDs, the best CRM score per region is kept.
In case of gene IDs, the best CRM score from multiple regions is kept.
:param bg_padding: Use X bp at start and end of each sequence only for calculating the background
nucleotide frequency, but not for scoring the motif itself.
:param mask: Consider masked (lowercase) nucleotides as Ns.
:param ssh_command: If defined, run Cluster-Buster over ssh by running the provided command to make the
connection before running Cluster-Buster.
Example : 'ssh -o ControlMaster=auto -o ControlPath=/tmp/ssh-control-path-%l-%h-%p-%r -o ControlPersist=600 <hostname>'
:return: (motif_id, df_crm_scores): motif ID and dataframe with top CRM score per region/gene ID.
"""
clusterbuster_command = []
if ssh_command:
# Add SSH command to the start of the Cluster-Buster command.
if isinstance(ssh_command, str):
clusterbuster_command.extend(ssh_command.split())
elif isinstance(ssh_command, list):
clusterbuster_command.extend(ssh_command)
# Construct Cluster-Buster command line.
clusterbuster_command.extend(
[
cluster_buster_path,
"-f",
"4",
"-c",
"0.0",
"-r",
"10000",
"-b",
str(bg_padding),
"-t",
"1",
]
)
if mask:
clusterbuster_command.append("-l")
clusterbuster_command.extend([motif_filename, fasta_filename])
# Score each region in FASTA file with Cluster-Buster for the provided motif and get top CRM score for each region.
try:
pid = subprocess.Popen(
args=clusterbuster_command,
bufsize=-1,
executable=None,
stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=None,
close_fds=False,
shell=False,
cwd=None,
env=None,
universal_newlines=False,
startupinfo=None,
creationflags=0,
)
stdout_data, stderr_data = pid.communicate()
except OSError as msg:
raise RuntimeError(
"Execution error for: '"
+ " ".join(clusterbuster_command)
+ "': "
+ str(msg)
)
if pid.returncode != 0:
raise RuntimeError(
"Error: Non-zero exit status for: '" + " ".join(clusterbuster_command) + "'"
)
# Read Cluster-Buster standard out as a pandas dataframe.
df_crm_scores = pd.read_csv(
filepath_or_buffer=io.BytesIO(stdout_data),
sep="\t",
header=0,
names=["seq_name", "crm_score"],
index_col="seq_name",
usecols=["seq_name", "crm_score"],
dtype={"seq_name": str, "crm_score": np.float32},
engine="c",
)
if extract_gene_id_from_region_id_regex_replace:
# Extract gene ID from the region ID by removing the non-gene part.
#
# Take the top CRM score for each gene ID by taking the maximum CRM
# score of all region IDs that belong to the same gene ID.
#
# Examples:
# - extract_gene_id_from_region_id_regex_replace = '#[0-9]+$'
# - region ID (input):
# "geneA#1", "geneB#1", "geneC#1", "geneA#2", "geneC#2"
# - gene IDs (output):
# "geneA", "geneB", "geneC"
# - extract_gene_id_from_region_id_regex_replace = '^.+@@'
# - region IDs (input):
# "region1@@geneA", "region2@@geneB", "region3@@geneA"
# - gene IDs (output):
# "geneA", "geneB"
df_crm_scores = (
df_crm_scores.assign(
gene_ids=df_crm_scores.index.str.replace(
extract_gene_id_from_region_id_regex_replace, "", regex=True
)
)
.groupby("gene_ids")
.max()
)
return motif_id, df_crm_scores