forked from rghu/reconCNV
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gis_ballele_to_vcf.py
116 lines (99 loc) · 3.71 KB
/
gis_ballele_to_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import json
import argparse
import csv
def main(ballele_file, vcf_file, rs_gene):
print("B-Allele:", ballele_file)
print("VCF Output:", vcf_file)
if rs_gene:
with open(rs_gene) as f:
rs_gene = json.loads(f.read())
else:
rs_gene = {}
with open(ballele_file) as ballele_handle, open(vcf_file, 'w') as vcf_handle:
ballele_reader = csv.DictReader(
ballele_handle,
fieldnames=["chrom", "pos", "rs", "freq"],
delimiter="\t"
)
vcf_handle.write("##fileformat=VCFv4.2\n")
vcf_handle.write("""##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (counting only informative reads out of the total reads) for the ref and alt alleles in the order listed">\n""")
vcf_handle.write("""##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">\n""")
vcf_handle.write("""##FORMAT=<ID=VF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">\n""")
vcf_handle.write("""##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n""")
vcf_handle.write("#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample\n")
vcf_writer = csv.DictWriter(
vcf_handle,
fieldnames=["chrom","pos", "id", "ref", "alt", "qual", "filter", "info", "format", "sample"],
delimiter="\t"
)
for row in ballele_reader:
chrom = row["chrom"]
chrom = chrom.replace("chr", "")
pos = row["pos"]
rs = row["rs"]
if not rs.startswith("rs"):
continue
gene = rs_gene.get(rs, rs)
freq = row["freq"]
try:
freq = float(freq)
except Exception as e:
print(f"Could float({freq}):", e)
continue
ref_freq = 1 - freq
ref_dp = int(100 * ref_freq)
alt_dp = int(100 * freq)
dp = ref_dp + alt_dp
ref = "N"
alt = "N"
qual = "."
fltr = "PASS"
info = ";".join([
f"DP={dp}",
f"AF={freq}",
f"VF={freq}",
])
frmt = "AD:VF:AF"
sample = ":".join([
f"{ref_dp},{alt_dp}",
f"{freq}",
f"{freq}",
])
vcf_writer.writerow({
"chrom": chrom,
"pos": pos,
"id": gene,
"ref": ref,
"alt": alt,
"qual": qual,
"filter": fltr,
"info": info,
"format": frmt,
"sample": sample
})
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="Illumina Gis ballele file to a VCF that can be read by reconCNV",
description="A script to convert the TSO500 Logs_Intermediates/Gis/ to a reconCNV .VCF file",
)
parser.add_argument(
"--ballele",
help="The input <sample>_bAllele.csv",
required=True
)
parser.add_argument(
"--vcf",
help="The output .vcf ready to be used by reconCNV",
required=True
)
parser.add_argument(
"--rs-gene",
help="A json file with {rs#: gene}",
default=None
)
args = parser.parse_args()
ballele_file = args.ballele
vcf_file = args.vcf
rs_gene = args.rs_gene
main(ballele_file, vcf_file, rs_gene)