forked from gminevich/cloudmap2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cloudmap2.py
584 lines (501 loc) · 22.6 KB
/
cloudmap2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
###############################################################################
# CloudMap2
#
# Bulked segregant mapping tool that generates maps of linked chromosomal
# regions with color-coded SNPs of interest. Also uses kernel density
# estimation on linked parental SNPs to predict position of most likely causal
# SNP(s).
###############################################################################
import argparse
from collections import OrderedDict
from itertools import islice
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
from scipy.stats import kde
from matplotlib.backends.backend_pdf import PdfPages
# Adjustable SNP ratios (adjust if your pool of mutants has been contaminated
# with WT). Values below this ratio (at mapping SNP positions only) will count
# as non-SNP variants.
permissive_ratio_HA_SNP = .1
# Values above the ratio below (in regions of linkage only) will count as
# parental SNP variants
permissive_ratio_Homoz_Parental_SNP = .6
read_depth_for_SNP_consideration = 10
alt_alleles_for_SNP_consideration = 2
# Sliding window parameter
window_size = 10
# Express this as a ratio rather than an absolute number to make the parameter
# independent of the chosen window_size.
permissible_ratio_count_in_window = 0.7
# Deals with cases where EMS mutation/random mutation in the parental strain
# exactly matches mapping strain SNP. Thus falsely interpreted as a mapping
# strain SNP in analysis.
spurious_mapping_snp_position_ratio_threshold = .6
# Whether to:
# plot all parental variants (false)
# or EMS variants (G/C—>A/T) only (true).
ems_flag = 'false'
# For purposes of making plots for paper figures, can label the known causal
# variant.
# hu80: cic-1 chr III:2,401,031
# ot785: lin-13 chr III:7,701,091
# causal_chromosome_flag = 'chrIII'
# causal_variant_position = '2401031'
max_x = 0
max_y = 0
def main(Homozygous_Bulked_Segregant_SNPs_w_Mapping_Strain_SNPs_Subtracted_file, Bulked_Segregant_Allele_Ratios_at_Mapping_Strain_SNP_Positions_file, ofile):
# ToDo: convert all hardcoded options to command line parameters
mapping_strain_positions_df = parse_mapping_strain_positions_df(
load_vcf_as_df(Bulked_Segregant_Allele_Ratios_at_Mapping_Strain_SNP_Positions_file)
)
parental_homozygous_snps_df = load_vcf_as_df(Homozygous_Bulked_Segregant_SNPs_w_Mapping_Strain_SNPs_Subtracted_file)
# ToDo: generalize this to handle all the different formats of chromosome
# names.
all_chromosomes = pd.unique(mapping_strain_positions_df.CHROM.ravel())
# Remove MtDNA from ndarray, right now only do it by index, but should
# likely also plot MtDNA since there could be causal mutations there.
all_chromosomes = np.delete(all_chromosomes, 4)
mapping_plot_pdf, fig = initiate_plot(ofile)
for chrom in all_chromosomes:
# Debug
print ("--------------------------------")
print ("chrom: ", chrom)
# Get longest region of linkage on each chromosome
max_window_start, max_window_end = \
longest_region_of_parental_linkage_via_mapping_snps(
mapping_strain_positions_df, chrom
)
# Get parental strain SNPs in longest mapping region on each chromosome
parental_strain_SNPs_in_longest_mapping_region = \
parental_strain_variants_in_longest_non_crossing_strain_subsegment(
parental_homozygous_snps_df,
max_window_start, max_window_end, chrom
)
# Calculate KDE on the parental strain SNPs within the mapping region
# Need at least 3 variants in order to use KDE
if len(parental_strain_SNPs_in_longest_mapping_region.index) >= 3:
kde_max_y, kde_max_x, xgrid, probablity_density_function = \
kernel_density_estimation(
parental_strain_SNPs_in_longest_mapping_region,
chrom
)
kde_output_plot(
xgrid, probablity_density_function,
chrom, parental_strain_SNPs_in_longest_mapping_region,
kde_max_x, kde_max_y
)
# If not enough SNPs to use KDE, plot them without kde.
elif 1 >= len(parental_strain_SNPs_in_longest_mapping_region.index) < 3:
minimal_snp_plot(
chrom, parental_strain_SNPs_in_longest_mapping_region
)
else:
minimal_snp_plot(
chrom, parental_strain_SNPs_in_longest_mapping_region
)
finish_plot(mapping_plot_pdf, fig)
def initiate_plot(ofile):
"""Set up plotting parameters."""
plt.style.use('ggplot')
mapping_plot_pdf = PdfPages(ofile)
fig = plt.figure(figsize=(11.69, 8.27), dpi=150)
return mapping_plot_pdf, fig
def finish_plot(mapping_plot_pdf=None, fig=None):
"""Finalize the plotting parameters and save."""
for ax in fig.axes:
plt.sca(ax)
plt.xticks(rotation=25)
plt.tight_layout(pad=.4, h_pad=.4, w_pad=.4)
plt.subplots_adjust(wspace=.3, hspace=.7)
mapping_plot_pdf.savefig(fig, pad_inches=.5, orientation='landscape')
mapping_plot_pdf.close()
def load_vcf_as_df(vcf_file):
""" Reads in a VCF, and converts key columns to a dataframe """
vcf_as_df = pd.read_csv(vcf_file, header='infer', comment='#', sep='\t')
vcf_as_df.columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
return vcf_as_df
def parse_mapping_strain_positions_df(mapping_strain_positions_df = None):
""" Parse the key elements of the vcf file """
vcf_read_group_data = mapping_strain_positions_df.SAMPLE.str.split(':')
vcf_allele_freq_data = vcf_read_group_data.str[1]
ref_allele_count = pd.to_numeric(vcf_allele_freq_data.str.split(',').str[0])
alt_allele_count = pd.to_numeric(vcf_allele_freq_data.str.split(',').str[1])
read_depth = vcf_read_group_data.str[2]
# ToDo: Change to ratio of alt/read depth (only for cases where
# read depth = alt+ref to account for sequencing error/noise).
# For now, just use alt/read depth.
ratio = (pd.to_numeric(alt_allele_count)) / (pd.to_numeric(read_depth))
parsed_mapping_strain_positions_df = pd.concat(
[mapping_strain_positions_df.CHROM,
mapping_strain_positions_df.POS,
alt_allele_count, ref_allele_count,
pd.to_numeric(read_depth), ratio],
axis=1
)
parsed_mapping_strain_positions_df.columns = [
'CHROM', 'POS',
'alt_allele_count', 'ref_allele_count',
'read_depth', 'ratio'
]
return parsed_mapping_strain_positions_df
def sliding_window(seq, window_size=10):
""" Returns a sliding window over data from the iterable """
iterable = iter(seq)
result = tuple(islice(iterable, window_size))
if len(result) == window_size:
yield result
for elem in iterable:
result = result[1:] + (elem,)
yield result
def longest_region_of_parental_linkage_via_mapping_snps(
mapping_strain_positions_df, current_chrom
):
"""
Calculates the longest region of parental linkage via analysis of SNP
ratios within a sliding window
"""
# either 0 ratio positions or cases where at least 2 alternate reads
mapping_strain_positions_df = mapping_strain_positions_df[
(mapping_strain_positions_df.alt_allele_count == 0)
| (mapping_strain_positions_df.alt_allele_count
> alt_alleles_for_SNP_consideration)
]
mapping_strain_positions_df = mapping_strain_positions_df.loc[
(mapping_strain_positions_df.CHROM == current_chrom)
& (mapping_strain_positions_df.read_depth
> read_depth_for_SNP_consideration)
]
# Convert DF to dictionary and find max consecutive interval
POS_ratio_DF = mapping_strain_positions_df[['POS','ratio']]
POS_ratio_dict = dict(zip(POS_ratio_DF.POS, POS_ratio_DF.ratio))
POS_ratio_dict_sorted = OrderedDict(sorted(POS_ratio_dict.items()))
current_run_windows_start = 0
current_run_windows_end = 0
# the first/leftmost position in the window that begins the longest
# interval below threshold
max_window_start = 0
# the last/rightmost position in the window that ends the longest interval
# below threshold
max_window_end = 0
consecutive_windows_below_threshold = 0
max_consecutive_windows_below_threshold = 0
# evaluate contents of each window
for window in sliding_window(POS_ratio_dict_sorted.items(), window_size):
ratios_accepted = ratios_rejected = 0
# store POS info of first element of window
current_window_start = window[0][0]
for pos, ratio in window:
# Discounts cases where parental mutations(EMS or random) exactly
# match at an HA position, treat these as neutral
if (consecutive_windows_below_threshold > 0) \
& (ratio > spurious_mapping_snp_position_ratio_threshold):
pass
elif ratio <.1:
# ratios of < .1 we count as if a 0 ratio, with allowance for
# sequencing error
ratios_accepted += 1
else:
ratios_rejected += 1
current_window_end = pos
if ratios_accepted > 0:
fraction_ratios_accepted = ratios_accepted / (ratios_accepted
+ ratios_rejected)
else:
# do not risk a division by zero
fraction_ratios_accepted = 0
# e.g. If 4/5 SNPs are below .1, that window is a run of 0 ratio
# positions and thus counted as "parental"
if fraction_ratios_accepted < permissible_ratio_count_in_window:
consecutive_windows_below_threshold = 0
else:
consecutive_windows_below_threshold += 1
if consecutive_windows_below_threshold == 1:
current_run_windows_start = current_window_start
current_run_windows_end = current_window_end
if consecutive_windows_below_threshold > max_consecutive_windows_below_threshold:
max_consecutive_windows_below_threshold = consecutive_windows_below_threshold
max_window_start = current_run_windows_start
max_window_end = pos
# Debug
print(
"max window: {:,} — {:,}"
.format(max_window_start, max_window_end)
)
print("max_consecutive_windows_below_threshold:",
max_consecutive_windows_below_threshold)
print(
"size of max window: {:,}"
.format(max_window_end - max_window_start)
)
return max_window_start, max_window_end
def parental_strain_variants_in_longest_non_crossing_strain_subsegment(
parental_homozygous_snps_df,
start_largest_consecutive=None, end_largest_consecutive=None,
current_chrom=None
):
"""
Identifies parental strain variants in the longest region devoid of crossing
strain snps
"""
# Split SAMPLE Column with each field as a new column
parental_homozygous_snps_read_group_data = \
parental_homozygous_snps_df.SAMPLE.str.split(':')
parental_homozygous_snps_allele_freq_data = \
parental_homozygous_snps_read_group_data.str[1]
parental_homozygous_ref_allele_count = \
parental_homozygous_snps_allele_freq_data.str.split(',').str[0]
parental_homozygous_ref_allele_count.name = 'ref_allele_count'
parental_homozygous_alt_allele_count = \
parental_homozygous_snps_allele_freq_data.str.split(',').str[1]
parental_homozygous_alt_allele_count.name = 'alt_allele_count'
parental_homozygous_read_depth = \
parental_homozygous_snps_read_group_data.str[2]
# Ratio of alt/read depth (only for cases where read depth = alt+ref).
# For now, just use alt/read depth
parental_homozygous_ratio = (
pd.to_numeric(parental_homozygous_alt_allele_count)
/pd.to_numeric(parental_homozygous_read_depth)
)
# Calculate EMS variants
parental_homozygous_snps_df['EMS'] = np.where(
np.logical_or(parental_homozygous_snps_df['REF']=='G',
parental_homozygous_snps_df['REF']=='C')
& np.logical_or(parental_homozygous_snps_df['ALT']=='A',
parental_homozygous_snps_df['ALT']=='T'),
'yes', 'no'
)
# Return DF of key VCF columns (header missing for newly created columns)
parental_homozygous_snps_df_ems_depth_calc = pd.concat(
[parental_homozygous_snps_df.EMS,
parental_homozygous_snps_df.CHROM,
parental_homozygous_snps_df.POS,
parental_homozygous_ratio],
axis=1
)
parental_homozygous_snps_df_ems_depth_calc.columns = [
'EMS', 'CHROM', 'POS', 'ratio'
]
# Get the linked chromosome. Currently there's a different chromosome
# naming convention in the homozygous parental variants file.
parental_homozygous_linked_chromosome = \
parental_homozygous_snps_df_ems_depth_calc[
(parental_homozygous_snps_df_ems_depth_calc.CHROM==current_chrom)
& (parental_homozygous_snps_df_ems_depth_calc.ratio
> permissive_ratio_Homoz_Parental_SNP)
]
# For testing/paper figure purposes plot all parental or just ems variants
if ems_flag == 'false':
# All parental variants (EMS and genetic drift)
parental_strain_SNPs_in_longest_mapping_region = \
parental_homozygous_linked_chromosome[
(parental_homozygous_linked_chromosome.POS
> start_largest_consecutive)
& (parental_homozygous_linked_chromosome.POS
< end_largest_consecutive)
]
elif ems_flag == 'true':
# Filtered for EMS variants
parental_strain_SNPs_in_longest_mapping_region = \
parental_homozygous_linked_chromosome[
(parental_homozygous_linked_chromosome.POS
> start_largest_consecutive)
& (parental_homozygous_linked_chromosome.POS
< end_largest_consecutive)
& (parental_homozygous_linked_chromosome.EMS =="yes")
]
# Debug
#if current_chrom == causal_chromosome_flag:
# parental_strain_SNPs_in_longest_mapping_region.to_csv(current_chrom+'_parental_variants_in_mapping_region.csv')
return parental_strain_SNPs_in_longest_mapping_region
def set_plot_axes(chrom=None, is_minimal_snp_plot=True):
""" Creates layout for each figure on the pdf, sets the x-axis """
chrom_length = 0
if chrom == 'chrI':
ax = plt.subplot2grid((3, 2), (0, 0))
if is_minimal_snp_plot:
chrom_length=16000000
if chrom == 'chrII':
ax = plt.subplot2grid((3, 2), (0, 1))
if is_minimal_snp_plot:
chrom_length=16000000
if chrom == 'chrIII':
ax = plt.subplot2grid((3, 2), (1, 0))
if is_minimal_snp_plot:
chrom_length=14000000
if chrom == 'chrIV':
ax = plt.subplot2grid((3, 2), (1, 1))
if is_minimal_snp_plot:
chrom_length=18000000
if chrom == 'chrV':
ax = plt.subplot2grid((3, 2), (2, 0))
if is_minimal_snp_plot:
chrom_length=21000000
if chrom == 'chrX':
ax = plt.subplot2grid((3, 2), (2, 1))
if is_minimal_snp_plot:
chrom_length=18000000
return ax, chrom_length
def kde_output_plot(
xgrid, probablity_density_function,
chrom, parental_strain_SNPs_in_longest_mapping_region,
kde_max_x, kde_max_y
):
"""
Calculates kernel density estimation on linked parental variants in
cases where have at least 3 variants.
"""
ax, _ = set_plot_axes(chrom, is_minimal_snp_plot=False)
plt.plot(xgrid, probablity_density_function, 'r-')
plt.title(chrom)
# Use/Remove scientific notation
# plt.ticklabel_format(style='plain', axis='y')
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
# Add commas to x-axis
ax.get_xaxis().set_major_formatter(
tkr.FuncFormatter(lambda x, p: format(int(x), ','))
)
# Annotate the predicted position
ax.annotate(
("{:,}".format(round(kde_max_x))),
(kde_max_x, kde_max_y),
xytext=(0, -30),
textcoords='offset points',
arrowprops=dict(facecolor='black', shrink=0.05),
horizontalalignment='center',
verticalalignment='bottom'
)
# ToDo: remove?
# parental_strain_SNPs_in_longest_mapping_region_EMS = parental_homozygous_linked_chromosome[(parental_homozygous_linked_chromosome.POS > start_largest_consecutive) & (parental_homozygous_linked_chromosome.POS < end_largest_consecutive) & (parental_homozygous_linked_chromosome.EMS =="yes")]
parental_strain_SNPs_in_longest_mapping_region_EMS = \
parental_strain_SNPs_in_longest_mapping_region[
(parental_strain_SNPs_in_longest_mapping_region.EMS =="yes")
]
# Plot the scatter points on X axis
plt.plot(
parental_strain_SNPs_in_longest_mapping_region.POS,
np.zeros_like(parental_strain_SNPs_in_longest_mapping_region.POS)+0,
'o',
color='grey'
)
# Overplot EMS variants in a different color
plt.plot(
parental_strain_SNPs_in_longest_mapping_region_EMS.POS,
np.zeros_like(parental_strain_SNPs_in_longest_mapping_region_EMS.POS)+0,
'd',
color='blue'
)
# Debug
# For paper figures, can plot the causal variant so it stands out
#if chrom == causal_chromosome_flag:
# plt.plot(causal_variant_position, 0, 'd', color='red')
#if chrom == 'chrI':
# plt.plot(causal_variant_position_I, 0, 'd', color='red')
#if chrom == 'chrV':
# plt.plot(causal_variant_position_II, 0, 'd', color='red')
def minimal_snp_plot(
chrom, parental_strain_SNPs_in_longest_mapping_region=None
):
"""
For cases where no parental variants in a region of linkage (defined as
region devoid of crossing strain variants) or < 3 linked parental variants,
plot the positions without KDE b/c KDE on < 3 points doesn't work.
"""
# Debug
print("Plot performed on these SNPs: \n",
parental_strain_SNPs_in_longest_mapping_region)
ax, chrom_length = set_plot_axes(chrom, is_minimal_snp_plot=True)
# Add commas to x-axis
ax.get_xaxis().set_major_formatter(
tkr.FuncFormatter(lambda x, p: format(int(x), ','))
)
# Debug: Remove scientific formatting
#plt.ticklabel_format(style='plain', axis='x')
# different X-axis depending on the amount of snps
plt.axis([0, chrom_length, 0, 1])
if len(parental_strain_SNPs_in_longest_mapping_region.index) == 0:
#if 0 SNPs, then plot empty plot
plt.axis([0, chrom_length, 0, 1])
elif len(parental_strain_SNPs_in_longest_mapping_region.index) == 1:
# if 1 snp, set axes to 1000bp+/-
plt.axis(
[parental_strain_SNPs_in_longest_mapping_region.POS.min() - 1000,
parental_strain_SNPs_in_longest_mapping_region.POS.min() + 1000,
0, 1]
)
elif len(parental_strain_SNPs_in_longest_mapping_region.index) == 2:
# if 2 snps, set axes to 1000bp+/- each end
plt.axis(
[parental_strain_SNPs_in_longest_mapping_region.POS.min() - 1000,
parental_strain_SNPs_in_longest_mapping_region.POS.max() + 1000,
0, 1]
)
# Identify the subset of EMS SNPs
parental_strain_SNPs_in_longest_mapping_region_EMS = \
parental_strain_SNPs_in_longest_mapping_region[
(parental_strain_SNPs_in_longest_mapping_region.EMS =="yes")
]
# Plot 1-2 variants without KDE
if len(parental_strain_SNPs_in_longest_mapping_region.index) > 0:
for positions in parental_strain_SNPs_in_longest_mapping_region:
# plot non-EMS variants
plt.plot(
parental_strain_SNPs_in_longest_mapping_region.POS,
np.zeros_like(
parental_strain_SNPs_in_longest_mapping_region.POS
) + 0,
'o', color='grey'
)
# overplot EMS variants in a different color
plt.plot(
parental_strain_SNPs_in_longest_mapping_region_EMS.POS,
np.zeros_like(
parental_strain_SNPs_in_longest_mapping_region_EMS.POS
) + 0,
'd', color='blue'
)
plt.title(chrom)
def kernel_density_estimation(
parental_strain_SNPs_in_longest_mapping_region=None, chrom=None
):
"""
Perform kernel density estimation on a given set of linked parental
variants. scipy.stats module has automatic bandwidth determination:
"""
print("KDE performed on these SNPs: \n",
parental_strain_SNPs_in_longest_mapping_region)
kernel = kde.gaussian_kde(
parental_strain_SNPs_in_longest_mapping_region.POS
)
xgrid = np.linspace(
parental_strain_SNPs_in_longest_mapping_region.POS.min(),
parental_strain_SNPs_in_longest_mapping_region.POS.max()
)
probablity_density_function = kernel.evaluate(xgrid)
# Find the maximum y value and its corresponding x value.
max_y = max(probablity_density_function)
max_x = xgrid[probablity_density_function.argmax()]
# Print large numbers with commas
print(
"Predicted position of mutation: ",
("{:,}".format(round(max_x)))
)
return max_y, max_x, xgrid, probablity_density_function
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Map a causal variant and plot linkage evidence.'
)
parser.add_argument('Homozygous_Bulked_Segregant_SNPs_w_Mapping_Strain_SNPs_Subtracted_file', nargs='?',
metavar='<Homozygous Bulked Segregant SNPs with mapping strain SNPs subtracted vcf>',
help='VCF file with SNPs that appear homozygous '
'in the bulked segregants sample, but are '
'NOT found in the mapping strain.')
parser.add_argument('Bulked_Segregant_Allele_Ratios_at_Mapping_Strain_SNP_Positions_file', nargs='?',
metavar='<Bulked Segregant Allele Ratios at Mapping Strain SNP Positions vcf>',
help='VCF file with allele ratios found in the bulked segregants sample '
'at mapping strain SNP positions only.')
parser.add_argument('-o', '--ofile', required=True,
help="output file for the plot")
args = vars(parser.parse_args())
main(**args)