forked from naumanjaved/fingerprint_maps
-
Notifications
You must be signed in to change notification settings - Fork 0
/
map_building_functions.py
571 lines (483 loc) · 18.5 KB
/
map_building_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
import subprocess
import os
import itertools as it
import sys
import argparse
import traceback
import time
import vcf
def filter_VCFs(chrom, VCF_file, int_dir, min_MAF):
    '''
    Run vcftools on one chromosome's VCF, keeping only SNPs that are
    biallelic, phased, non-indel, and above the minimum MAF.

    Parameters
    ----------
    chrom : string
        chromosome number
    VCF_file : string
        path to 1000 genomes VCF file
    int_dir : string
        directory where intermediate files are stored
    min_MAF : float
        filter out all SNPs with population averaged MAF less than this value

    Returns
    -------
    None
    '''
    out_prefix = int_dir + "chr_" + chrom
    # Assemble the vcftools invocation token-by-token, then hand the
    # joined string to the shell (output lands at <out_prefix>.recode.vcf).
    args = [
        "vcftools",
        "--vcf", VCF_file,
        "--maf", str(min_MAF),
        "--phased",
        "--remove-indels",
        "--min-alleles", "2",
        "--max-alleles", "2",
        "--recode", "--recode-INFO-all",
        "--out", out_prefix,
    ]
    subprocess.call(" ".join(args), shell=True)
def extract_similar_SNPs(chrom, int_dir, SIM, min_MAF):
    '''
    Write to a new list file the SNPs in the filtered VCF whose population
    specific minor allele fractions (MAFs) do not differ pairwise by more
    than SIM. For example, this function rejects a SNP with
    EUR_AF - AMR_AF = 0.15 if SIM = 0.1. Each kept SNP is written as
    "<record index>-chr<chrom>:<record index>".

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory where intermediate files are stored
    SIM : float
        similarity value - the maximum pairwise difference in population
        specific MAF above which a SNP is rejected
    min_MAF : float
        unused here; MAF filtering is done in filter_VCFs (kept for
        signature compatibility with callers)

    Returns
    -------
    None
    '''
    VCF_file = int_dir + 'chr_' + chrom + '.recode.vcf'
    vcf_reader = vcf.Reader(open(VCF_file, 'r'))  # Loop over VCF records
    sim_SNPs_file = int_dir + "chr_" + chrom + "-common-SNPs.list"
    similar_SNPs = open(sim_SNPs_file, 'w')
    for index, record in enumerate(vcf_reader):
        # Positional name for the SNP. The original code's
        # isinstance(SNP, basestring) test was always true (and is a
        # NameError on Python 3), so every SNP is named positionally;
        # that effective behavior is kept, minus the Py2-only builtin.
        SNP = 'chr' + str(chrom) + ':' + str(index)
        values = []  # Population allele frequencies (AFs)
        for field in record.INFO.keys():  # Load population AFs
            if '_AF' in field:
                values.append(float(record.INFO[field][0]))
        # Require all 5 super-population AFs and a minimum AF of 0.10
        # BEFORE computing pairwise differences; checking first also
        # prevents max() from raising ValueError on an empty sequence
        # when a record carries fewer than two *_AF fields.
        if len(values) != 5 or min(values) < 0.10:
            continue
        differences = [abs(y - x) for x, y in it.combinations(values, 2)]
        if max(differences) > SIM:
            continue  # Similarity threshold exceeded: skip SNP
        similar_SNPs.write(str(index) + '-' + SNP + '\n')
    similar_SNPs.close()
def keep_common_SNPs(chrom, int_dir):
    '''
    Create a VCF containing only the SNPs identified by the
    extract_similar_SNPs fxn, with each record's ID column replaced by
    the name stored in the SNP list.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored

    Returns
    -------
    None
    '''
    list_path = int_dir + "chr_" + chrom + "-common-SNPs.list"
    vcf_path = int_dir + "chr_" + chrom + ".recode.vcf"
    out_path = int_dir + "chr_" + chrom + ".common.recode.vcf"
    sim_SNPs_file = open(list_path, 'r')
    VCF_file = open(vcf_path, 'r')
    output_file = open(out_path, 'w')
    # Copy header lines straight through; collect data lines for lookup
    # by record index.
    data_lines = []
    for line in VCF_file:
        if '#' in line.split('\t')[0]:
            output_file.write(line)
        else:
            data_lines.append(line)
    # The list file holds "<record index>-<name>" entries.
    SNP_dict = {}
    for line in sim_SNPs_file:
        snp_num, name = line.rstrip('\n').split('-')
        SNP_dict[int(snp_num)] = name
    sim_SNPs_file.close()
    # Emit kept records in index order with the ID column rewritten.
    for snp_num in sorted(SNP_dict):
        fields = data_lines[snp_num].split('\t')
        fields[2] = SNP_dict[snp_num]
        output_file.write('\t'.join(fields))
    VCF_file.close()
    output_file.close()
def create_PLINK_binary(chrom, int_dir, recomb_file):
    '''
    Create PLINK binary files (.bed/.bim/.fam) from the common-SNP VCF
    for use with the LDSC script (https://github.com/bulik/ldsc).

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored
    recomb_file : string
        1000genomes recombination map file

    Returns
    -------
    None
    '''
    # Use the recombination map so plink writes genetic positions in
    # centimorgans into the .bim file.
    command = ("plink --vcf {0}chr_{1}.common.recode.vcf "
               "--cm-map {2} {1} "
               "--make-bed --out {0}{1}").format(int_dir, chrom, recomb_file)
    subprocess.call(command, shell=True)
def LD_score(chrom, int_dir, LD_script, window_cm):
    '''
    Calculate the LD score of all variants on one chromosome using the
    LDSC script (https://github.com/bulik/ldsc).

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored
    LD_script : string
        path to the python LD script
    window_cm : float
        for autosomes, the window in centimorgans over which to
        calculate LD scores

    Returns
    -------
    None
    '''
    # --yes-really acknowledges ldsc's warning about whole-chromosome runs.
    command = ("python {0} --bfile {1}{2} "
               "--ld-wind-cm {3} "
               "--out {1}LD-{2} "
               "--l2 --yes-really").format(LD_script, int_dir, chrom, window_cm)
    subprocess.call(command, shell=True)
def prune(chrom, int_dir, window, slide, cutoff):
    '''
    Take in all SNPs found on the chromosome and prune them to a set of
    independent variants. PLINK looks at all pairs of variants within an X
    variant window. If a pair has an R^2 correlation greater than some
    threshold, then one SNP within this pair is removed. After pruning
    within this window, PLINK slides along the chromosome by some number
    of SNPs.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored
    window : int
        size of window in kb over which to compute all pairwise correlations
    slide : int
        number of SNPs to slide over after each iteration
    cutoff : float
        maximum R^2 correlation between two SNPs, above which one SNP in
        the pair is pruned.

    Returns
    -------
    None
    '''
    prune_params = str(window) + " " \
                   + str(slide) + " " \
                   + str(cutoff)
    # Build the command once; the original duplicated the whole string
    # for the X chromosome just to insert one extra flag.
    parts = [
        "plink --bfile " + int_dir + chrom,
        " --indep-pairwise " + prune_params,
        " --r",
    ]
    if chrom == 'X':
        # Treat chromosome X under plink's default X-chromosome model.
        parts.append(" --ld-xchr 1")
    parts.append(" --out " + int_dir + chrom)
    subprocess.call("".join(parts), shell=True)
def order(chrom, int_dir):
    '''
    Reassign LDscore_i_new = (1.0 - LD_score_i / max(LD_score)) and write
    the results as a pseudo association file. This gives the highest
    scoring SNP the lowest p value, and thus the greatest priority when
    clumping.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored

    Returns
    -------
    None
    '''
    association_file_name = int_dir + "LD-" + chrom + ".l2.ldscore"
    # Unzip the LD score file produced by ldsc
    subprocess.call("gzip -d " + association_file_name + ".gz", shell=True)
    assoc = open(association_file_name, 'r')
    p_file = open(int_dir + chrom + ".p", 'w')
    p_file.write("SNP" + "\t" + 'P\n')
    # SNP -> LD score mapping, skipping the header line
    p_dictionary = {}
    for line in assoc.readlines()[1:]:
        fields = line.split('\t')
        p_dictionary[fields[1]] = float(fields[3].rstrip('\n'))
    assoc.close()
    max_LD = float(max(p_dictionary.values()))
    # For each SNP (highest LD score first) map the score to
    # (1.0 - LD_i / max_LD) + 1e-6; the additive term sets the minimum
    # "significance value". Scores mapping to >= 1.0 are dropped.
    for SNP in sorted(p_dictionary, key=p_dictionary.get, reverse=True):
        p_val = (1.0 - float(p_dictionary[SNP]) / max_LD) + 0.000001
        if p_val < 1.0:
            p_file.write(SNP + '\t' + str(p_val) + '\n')
    p_file.close()
def LD_separate(chrom, int_dir):
    '''
    Read in the association file created by the order function and
    separate it into independent (prune.in) SNPs, written to _in.p, and
    dependent SNPs, written to _out.p.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored

    Returns
    -------
    None
    '''
    LD_file = open(int_dir + chrom + ".p", 'r')
    prune_file = open(int_dir + chrom + ".prune.in", 'r')
    in_file = open(int_dir + chrom + "_in.p", 'w')
    in_file.write('SNP' + '\t' + 'P\n')
    out_file = open(int_dir + chrom + "_out.p", 'w')
    out_file.write('SNP' + '\t' + 'P\n')
    # A set gives O(1) membership tests; the original used a list,
    # making the split below O(n) per SNP (O(n^2) overall).
    index_SNPs = set()
    with prune_file as prune:
        for line in prune:
            index_SNPs.add(line.rstrip('\n'))
    with LD_file as LD:
        for k, line in enumerate(LD):
            if k > 0:  # skip the "SNP\tP" header
                SNP = line.rstrip('\n').split('\t')[0]
                if SNP in index_SNPs:
                    in_file.write(line)
                else:
                    out_file.write(line)
    in_file.close()
    out_file.close()
def clump(chrom, int_dir, cutoff, max_size):
    '''
    Clump dependent variants (_out.p) around independent index variants
    (_in.p). Each SNP "clumped" around an index variant should be within
    a certain window size (specified with --clump-kb) and have an r^2
    correlation of at least "cutoff" (specified with --clump-r2) with the
    index variant. p1 and p2 are the upper significance thresholds above
    which SNPs are excluded; both are set to 1 to include all SNPs.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored
    cutoff : float
        minimum r^2 correlation required to become part of a clump
    max_size : int
        max distance in kb a SNP can be from an index variant in order
        to be included in a clump

    Returns
    -------
    None
    '''
    prefix = int_dir + chrom
    command = ("plink --bfile {0}"
               " --clump {0}_in.p {0}_out.p "
               "--clump-index-first "
               "--clump-p1 1.0 "
               "--clump-p2 1.0 "
               "--clump-r2 {1} "
               "--clump-kb {2} "
               "--out {0}").format(prefix, cutoff, max_size)
    subprocess.call(command, shell=True)
def reformat_clumps(chrom, int_dir):
    '''
    Read in the clumps produced by plink --clump and write them into a
    haplotype map file readable by picard. This file format has recently
    been deprecated in favor of a VCF format file so this should be
    updated at some point.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored

    Returns
    -------
    None
    '''
    # Sort the clumps by base pair position of index variant
    command = "tail -n+2 " + int_dir + chrom + ".clumped" \
        + " | sort -k4,4 > " + int_dir + chrom \
        + "_sorted.clumped"
    subprocess.call(command, shell=True)
    block_dict = {}  # Block dictionary with variant:index variant pairs
    anchor_file = open(int_dir + chrom + "_sorted.clumped", 'r')
    anchors = []
    # NOTE(review): tail -n+2 already dropped one line; [2:] skips two
    # more lines of the sorted output — confirm against actual plink
    # .clumped output (header + blank lines).
    for line in anchor_file.readlines()[2:]:
        # Whitespace-split column 3 of .clumped is the index (anchor) SNP ID
        anchor = line.rstrip('\n').split()[2]
        anchors.append(anchor)
        # Anchors map to '' so they are recognizable as index variants below
        block_dict[anchor] = ""
        # Column 12 holds the comma-separated clump members, each with a
        # parenthesized suffix, e.g. "rs123(1)" or "rs123(w)"
        snps = line.rstrip('\n').split()[11].split(',')
        for snp in snps:
            # NOTE(review): rstrip('(w)') strips any trailing '(', ')' or
            # 'w' characters, not the literal suffix "(w)" — for members
            # like "rs1(2)" this leaves "rs1(2", so the anchor-membership
            # test may not behave as intended; presumably
            # snp.split('(')[0] was meant. Confirm before changing.
            if snp.rstrip('(w)') not in anchors:
                variant = snp.split('(')[0]
                block_dict[variant] = anchor
    anchor_file.close()
    bim_dict = {}
    # Read in plink .bim file to recover major and minor alleles
    # (.bim columns: chrom, id, cm, bp, allele1, allele2)
    with open(int_dir + chrom + '.bim') as bim_file:
        for line in bim_file:
            line = line.rstrip('\n')
            entries = line.split('\t')
            variant = entries[1]
            minor = entries[4]
            major = entries[5]
            bim_dict[variant] = (minor, major)
    old_map = vcf.Reader(open(int_dir + "chr_" + chrom + ".common.recode.vcf", 'r'))
    new_map = open(int_dir + chrom + ".map", 'w')
    '''
    parse each line in old vcf file to extract the SNP name, base
    pair position, minor and major alleles, MAF and look up its index
    variant in the block dictionary defined above. If the variant IS
    an index variant then write its index variant as ''
    '''
    for record in old_map:
        # Only variants that ended up in some clump (or are anchors) are kept
        if record.ID in block_dict.keys():
            ref = record.REF[0]
            alt = record.ALT[0]
            maf = record.INFO['AF'][0]
            minor, major = bim_dict[record.ID]
            switched_maf = str(maf)
            # When the VCF ALT allele equals plink's allele-2 ("major"),
            # the VCF AF refers to the other allele: flip ref/alt and
            # use 1 - AF so the map reports the minor-allele frequency.
            if str(alt) == str(major):
                switched_maf = str(1.0 - float(maf))
                ref = major
                alt = minor
            # Map line: chrom, bp, id, ref, alt, maf, anchor ('' for
            # anchors themselves), trailing empty column.
            newline = str(record.CHROM) + '\t' \
                + str(record.POS) + '\t' \
                + str(record.ID) + '\t' \
                + str(ref) + '\t' \
                + str(alt) + '\t' \
                + str(switched_maf) + '\t' \
                + str(block_dict[record.ID]) \
                + '\t' + "" + '\t' + '\n'
            new_map.write(newline)
    new_map.close()
def detect_negative_LD(chrom, int_dir):
    '''
    Read the pairwise r values from plink's .ld output and write out the
    pairs of SNPs in the newly created map file that are in negative LD
    (a high r^2 can hide a negative r).

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate and output files are stored

    Returns
    -------
    None
    '''
    # (SNP_A, SNP_B) -> r value (as a string), skipping the header line.
    # .ld whitespace-split columns: 2 = SNP_A, 5 = SNP_B, 6 = R.
    r_dict = {}
    with open(int_dir + chrom + ".ld", 'r') as r:
        for k, line in enumerate(r):
            if k > 0:
                fields = line.split()
                r_dict[(fields[2], fields[5])] = fields[6].rstrip('\n')
    # Loop over the map file; for each SNP/anchor pair check both key
    # orderings in r_dict and append negative-r pairs to the .negLD file.
    # The file is only created if at least one negative pair exists.
    neg_LD_file_name = int_dir + chrom + ".negLD"
    with open(int_dir + chrom + ".map", 'r') as maps:
        for line in maps:
            fields = line.split('\t')
            if len(fields) > 6:
                snp1 = fields[2]
                snp2 = fields[6]
                # 'key in r_dict' instead of 'in r_dict.keys()': on
                # Python 2 .keys() builds a list, making each test O(n).
                for key in ((snp1, snp2), (snp2, snp1)):
                    if key in r_dict:
                        if float(r_dict[key]) < 0.0:
                            with open(neg_LD_file_name, 'a') as negLDfile:
                                negLDfile.write(snp1 + '\t' + snp2 + '\t'
                                                + r_dict[key] + '\n')
                            # Remove so the pair is only reported once
                            del r_dict[key]
def switch_alleles(chrom, int_dir, out_dir):
    '''
    For each chromosomal map file, read in the pairs of SNPs found to be
    in negative LD, switch the major and minor alleles in the map file,
    and switch the MAF to 1-MAF. Keep the original map file and write the
    new map file to *.filtered.map.

    Parameters
    ----------
    chrom : string
        chromosome number
    int_dir : string
        directory path where intermediate files are stored
    out_dir : string
        directory path where output files are stored

    Returns
    -------
    None
    '''
    # If no pairs were in negative linkage, copy to output folder and skip
    if not os.path.isfile(int_dir + chrom + ".negLD"):
        command = "cp " + int_dir + chrom + ".map " \
                  + out_dir + chrom + ".filtered.map"
        subprocess.call(command, shell=True)
        return None
    # First column of each .negLD line is the map-file SNP of the pair.
    # A set gives O(1) membership (original used a list: O(n) per line).
    neg_SNPs = set()
    with open(int_dir + chrom + ".negLD", 'r') as neg:
        for line in neg:
            neg_SNPs.add(line.split()[0])
    newmapfile = open(out_dir + chrom + ".filtered.map", 'w')
    with open(int_dir + chrom + ".map", 'r') as old:
        for line in old:
            # Map columns: chrom, bp, snp, allele1, allele2, maf, anchor, ...
            # Split once per line (original re-split for every field) and
            # avoid rebinding the `chrom` parameter, which the original
            # shadowed inside the loop.
            fields = line.split('\t')
            if fields[2] in neg_SNPs:
                # Swap the alleles and replace maf with 1 - maf
                flipped = [fields[0], fields[1], fields[2],
                           fields[4], fields[3],
                           str(1.0 - float(fields[5])),
                           fields[6], "", "\n"]
                newmapfile.write('\t'.join(flipped))
            else:
                newmapfile.write(line)
    newmapfile.close()