forked from jmcbroome/introduction-website
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprepare_us_states.py
50 lines (46 loc) · 2.34 KB
/
prepare_us_states.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# This script is a wrapper around the primary pipeline script.
# It does all necessary preprocessing of the global public SARS-CoV-2 phylogenetic tree
# (https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/) and generates
# all files needed to calculate introductions into each U.S. state.
#
# Example command line usage:
# python3 prepare_us_states.py -i public-latest.all.masked.pb -m public-latest.metadata.tsv
# -a hu1.gb -l state_lexicon.txt -j us-states.geo.json -r 0
# -x “date,country,name,Nextstrain_clade_usher,pango_lineage_usher”
#--------------------------------------------------------------
import subprocess
from master_backend import primary_pipeline, parse_setup
from utils import read_lexicon
#--------------------------------------------------------------
#Step 1: Process metadata file
print("Processing metadata file")
args = parse_setup()
conversion = read_lexicon(args.lexicon)
pbf = args.input
subprocess.check_call("matUtils extract -i " + pbf + " -u samplenames.txt",shell=True)
badsamples = open("unlabeled_samples.txt","w+")
with open("samplenames.txt") as inf:
with open("sample_regions.tsv","w+") as outf:
for entry in inf:
country = entry.split("/")[0]
if country == "USA":
state = entry.split("/")[1].split("-")[0]
if state in conversion:
print(entry.strip() + "\t" + conversion[state.upper()], file = outf)
else:
print(entry.strip(), file = badsamples)
badsamples.close()
#--------------------------------------------------------------
#Step 2: Prune the protobuf file to include only US samples.
# Notes: this step is optional. Skip this step if you would like to create a global phylogenetic tree.
# If you decide to use the full global phylogenetic tree, we recommend setting up your own Taxonium
# backend, as very large trees may cause Taxonium to bog down.
print("Clearing out unparseable samples.")
subprocess.check_call("matUtils extract -i " + pbf + " -s unlabeled_samples.txt -p -o clean.pb", shell = True)
#--------------------------------------------------------------
#Step 3: Run the primary pipeline
#update the arguments parsed
args.input = "clean.pb" #comment this line out if you are skipping step 2.
args.sample_regions = "sample_regions.tsv"
print("Starting main pipeline.")
primary_pipeline(args)