-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathplink_dupvar_prioritize.py
35 lines (26 loc) · 1.74 KB
/
plink_dupvar_prioritize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
import argparse
# Before running this script, generate *.dupvar and *.frq.counts files:
# 1) plink --bfile <PLINK file> --list-duplicate-vars
# 2) plink --bfile <PLINK file> --freq counts
# After running this script, you can exclude duplicated variant IDs with the highest missingness rate:
# 1) plink --bfile <PLINK file> --exclude <output> --keep-allele-order --make-bed --out <new PLINK file>
argparser = argparse.ArgumentParser(description = 'Selects variants with highest missingness from the duplicates (*.dupvar file).')
argparser.add_argument('-d', '--dupvar', metavar = 'file', dest = 'in_dupvar', type = str, required = True, help = '*.dupvar file generated by PLINK.')
argparser.add_argument('-c', '--counts', metavar = 'file', dest = 'in_counts', type = str, required = True, help = '*.frq.counts file generated by PLINK.')
argparser.add_argument('-o', '--output', metavar = 'file', dest = 'out_filename', type = str, required = True, help = 'Output file name with variant IDs.')


def _read_missingness(counts_path):
    """Return a dict mapping each variant ID to its missing-genotype count.

    The file is whitespace-delimited with a header line; the 'SNP' column
    supplies the variant ID and the 'G0' column the count, parsed as int.

    Raises:
        KeyError: if a non-blank data line has no 'SNP' or 'G0' field.
        ValueError: if a 'G0' value is not an integer.
    """
    missingness = dict()
    with open(counts_path, 'rt') as icounts:
        header = icounts.readline().split()
        for line in icounts:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to record.
                continue
            fields = dict(zip(header, values))
            missingness[fields['SNP']] = int(fields['G0'])
    return missingness


def _variants_to_exclude(var_ids, missingness):
    """Return every ID of one duplicate group except the least-missing one.

    IDs are ordered by descending missingness and the last one (lowest
    missingness) is dropped from the result, i.e. it is the one to keep.
    The sort is stable, so ties keep their order from the input list.

    Raises:
        KeyError: if any ID has no entry in `missingness`.
    """
    unknown = [var_id for var_id in var_ids if var_id not in missingness]
    if unknown:
        raise KeyError('Variant ID(s) not found in the counts file: ' + ', '.join(unknown))
    ranked = sorted(var_ids, key = lambda var_id: missingness[var_id], reverse = True)
    return ranked[:-1]


def main(argv = None):
    """Write the duplicate variant IDs that should be excluded, one per line.

    Args:
        argv: list of command-line arguments; None makes argparse read
            them from sys.argv, which is what the script entry point uses.
    """
    args = argparser.parse_args(argv)
    missingness = _read_missingness(args.in_counts)
    with open(args.in_dupvar, 'rt') as idupvar, open(args.out_filename, 'wt') as ofile:
        idupvar.readline()  # skip the header line
        for line in idupvar:
            # The last tab-separated column holds the space-separated IDs of one duplicate group.
            var_ids = line.strip().split('\t')[-1].split()
            for var_id in _variants_to_exclude(var_ids, missingness):
                ofile.write(var_id + '\n')


if __name__ == '__main__':
    main()