-
Notifications
You must be signed in to change notification settings - Fork 2
/
Snakefile
156 lines (134 loc) · 3.64 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
genes = [
"S",
"ORF1a",
"ORF1b",
"ORF3a",
"E",
"M",
"N",
"ORF6",
"ORF7a",
"ORF7b",
"ORF8",
"ORF9b",
]
rule all:
input:
commit_touchfile="commit.done",
# "upload.done",
shell:
"""
rm {input.commit_touchfile}
"""
rule get_consensus:
output:
"build/pango-consensus-sequences_genome-nuc_unsorted.fasta",
params:
command="scp -q [email protected]:~/nextclade_gisaid/sars-cov-2/pre-processed/synthetic.fasta"
if config.get("local", False)
else "cp ~/nextclade_gisaid/sars-cov-2/pre-processed/synthetic.fasta",
shell:
# To be adjusted if repos move
"""
{params.command} {output}
"""
rule sort_sequences:
input:
"build/pango-consensus-sequences_genome-nuc_unsorted.fasta",
output:
"build/pango-consensus-sequences_genome-nuc.fasta",
shell:
"seqkit sort -N {input} >{output}"
rule run_nextclade:
input:
"build/pango-consensus-sequences_genome-nuc.fasta",
output:
tsv="build/nextclade.tsv",
shell:
"nextclade run --in-order -d sars-cov-2 --output-all build {input}"
rule download_open_metadata:
output:
"build/open_metadata.tsv.zst",
shell:
"""
aws s3 cp --no-progress s3://nextstrain-data/files/ncov/open/metadata.tsv.zst {output}
"""
rule find_open_lineages:
"""
Publish consensus sequences only if there are 3 or more open genomes
"""
input:
rules.download_open_metadata.output,
output:
"build/open_lineages.txt",
shell:
"""
zstdcat {input} | \
tsv-summarize -H --group-by Nextclade_pango --count | \
tsv-filter -H --ge 'count:3' | \
tsv-select -H -f1 >{output}
"""
rule compress_nuc:
input:
fasta="build/pango-consensus-sequences_genome-nuc.fasta",
open="build/open_lineages.txt",
output:
"data/pango-consensus-sequences_genome-nuc.fasta.zst",
shell:
"""
seqkit grep -f {input.open} {input.fasta} | \
zstd -f --ultra -21 >{output}
"""
rule compress_translations:
input:
fasta="build/nextclade_gene_{gene}.translation.fasta",
open="build/open_lineages.txt",
output:
"data/pango-consensus-sequences_{gene}-aa.fasta.zst",
shell:
"""
seqkit grep -f {input.open} {input.fasta} | \
zstd -f --ultra -21 >{output}
"""
rule create_json:
input:
nextclade_tsv="build/nextclade.tsv",
output:
"data/pango-consensus-sequences_summary.json",
shell:
"""
python scripts/create_json.py \
--nextclade-tsv {input.nextclade_tsv} \
--output {output}
"""
rule commit_results:
input:
"data/pango-consensus-sequences_summary.json",
"data/pango-consensus-sequences_genome-nuc.fasta.zst",
expand("data/pango-consensus-sequences_{gene}-aa.fasta.zst", gene=genes),
output:
"commit.done",
params:
add="git add" if not config.get("test", False) else "#",
commit="git commit -m 'Automatic data update'; git push"
if not config.get("test", False)
else "",
shell:
"""
{params.add} {input}
git status
{params.commit}
touch {output}
"""
# rule upload_to_s3:
# """
# Upload of uncompressed files to S3
# All fastas and json
# """
# input:
# output:
# "upload.done",
# shell:
# """
# aws s3 sync data/ s3://nextstrain-data/files/pango-designation/
# """